{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "import itertools\n",
    "\n",
    "#处理事件字符串\n",
    "import datetime\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "os.chdir(\"E:\\pythonstudy\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第一部分：数据预览"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>interested</th>\n",
       "      <th>not_interested</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1918771225</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1502284248</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3044012</td>\n",
       "      <td>2529072432</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3044012</td>\n",
       "      <td>3072478280</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1390707377</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user       event  invited                         timestamp  interested  \\\n",
       "0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00           1   \n",
       "3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "\n",
       "   not_interested  \n",
       "0               0  \n",
       "1               0  \n",
       "2               0  \n",
       "3               0  \n",
       "4               0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1776192</td>\n",
       "      <td>2877501688</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1776192</td>\n",
       "      <td>3025444328</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1776192</td>\n",
       "      <td>4078218285</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1776192</td>\n",
       "      <td>1024025121</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1776192</td>\n",
       "      <td>2972428928</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:21.985000+00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user       event  invited                         timestamp\n",
       "0  1776192  2877501688        0  2012-11-30 11:39:01.230000+00:00\n",
       "1  1776192  3025444328        0  2012-11-30 11:39:01.230000+00:00\n",
       "2  1776192  4078218285        0  2012-11-30 11:39:01.230000+00:00\n",
       "3  1776192  1024025121        0  2012-11-30 11:39:01.230000+00:00\n",
       "4  1776192  2972428928        0  2012-11-30 11:39:21.985000+00:00"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = pd.read_csv(\"test.csv\")\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>locale</th>\n",
       "      <th>birthyear</th>\n",
       "      <th>gender</th>\n",
       "      <th>joinedAt</th>\n",
       "      <th>location</th>\n",
       "      <th>timezone</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3197468391</td>\n",
       "      <td>id_ID</td>\n",
       "      <td>1993</td>\n",
       "      <td>male</td>\n",
       "      <td>2012-10-02T06:40:55.524Z</td>\n",
       "      <td>Medan  Indonesia</td>\n",
       "      <td>480.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3537982273</td>\n",
       "      <td>id_ID</td>\n",
       "      <td>1992</td>\n",
       "      <td>male</td>\n",
       "      <td>2012-09-29T18:03:12.111Z</td>\n",
       "      <td>Medan  Indonesia</td>\n",
       "      <td>420.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>823183725</td>\n",
       "      <td>en_US</td>\n",
       "      <td>1975</td>\n",
       "      <td>male</td>\n",
       "      <td>2012-10-06T03:14:07.149Z</td>\n",
       "      <td>Stratford  Ontario</td>\n",
       "      <td>-240.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1872223848</td>\n",
       "      <td>en_US</td>\n",
       "      <td>1991</td>\n",
       "      <td>female</td>\n",
       "      <td>2012-11-04T08:59:43.783Z</td>\n",
       "      <td>Tehran  Iran</td>\n",
       "      <td>210.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3429017717</td>\n",
       "      <td>id_ID</td>\n",
       "      <td>1995</td>\n",
       "      <td>female</td>\n",
       "      <td>2012-09-10T16:06:53.132Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>420.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id locale birthyear  gender                  joinedAt  \\\n",
       "0  3197468391  id_ID      1993    male  2012-10-02T06:40:55.524Z   \n",
       "1  3537982273  id_ID      1992    male  2012-09-29T18:03:12.111Z   \n",
       "2   823183725  en_US      1975    male  2012-10-06T03:14:07.149Z   \n",
       "3  1872223848  en_US      1991  female  2012-11-04T08:59:43.783Z   \n",
       "4  3429017717  id_ID      1995  female  2012-09-10T16:06:53.132Z   \n",
       "\n",
       "             location  timezone  \n",
       "0    Medan  Indonesia     480.0  \n",
       "1    Medan  Indonesia     420.0  \n",
       "2  Stratford  Ontario    -240.0  \n",
       "3        Tehran  Iran     210.0  \n",
       "4                 NaN     420.0  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " \"\"\"\n",
    "用户描述信息在users.csv文件：共7维特征\n",
    "user_id\n",
    "locale：地区，语言\n",
    "birthyear：出身年\n",
    "gender：性别\n",
    "joinedAt：用户加入APP的时间，ISO-8601 UTC time\n",
    "location：地点\n",
    "timezone：时区\n",
    " \"\"\"\n",
    "\n",
    "#读取数据\n",
    "users = pd.read_csv(\"users.csv\")\n",
    "users.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  c_99  \\\n",
       "0  NaN  NaN    2   ...        0     1     0     0     0     0     0     0   \n",
       "1  NaN  NaN    2   ...        0     0     0     0     0     0     0     0   \n",
       "2  NaN  NaN    0   ...        0     0     0     0     0     0     0     0   \n",
       "3  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "4  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "\n",
       "   c_100  c_other  \n",
       "0      0        9  \n",
       "1      0        7  \n",
       "2      0       12  \n",
       "3      0        8  \n",
       "4      0        9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " \"\"\"\n",
    "活动描述信息在events.csv文件：共110维特征\n",
    "前9列：event_id, user_id, start_time, city, state, zip, country, lat, and lng.\n",
    "event_id：id of the event, \n",
    "user_id：id of the user who created the event.  \n",
    "city, state, zip, and country： more details about the location of the venue (if known).\n",
    "lat and lng： floats（latitude and longitude coordinates of the venue）\n",
    "start_time： 字符串，ISO-8601 UTC time，表示活动开始时间\n",
    "\n",
    "后101列为词频：count_1, count_2, ..., count_100，count_other\n",
    "count_N：活动描述出现第N个词的次数\n",
    "count_other：除了最常用的100个词之外的其余词出现的次数\n",
    " \"\"\"\n",
    "\n",
    "#读取数据\n",
    "events = pd.read_csv(\"events.csv\")\n",
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event</th>\n",
       "      <th>yes</th>\n",
       "      <th>maybe</th>\n",
       "      <th>invited</th>\n",
       "      <th>no</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1159822043</td>\n",
       "      <td>1975964455 252302513 4226086795 3805886383 142...</td>\n",
       "      <td>2733420590 517546982 1350834692 532087573 5831...</td>\n",
       "      <td>1723091036 3795873583 4109144917 3560622906 31...</td>\n",
       "      <td>3575574655 1077296663</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>686467261</td>\n",
       "      <td>2394228942 2686116898 1056558062 3792942231 41...</td>\n",
       "      <td>1498184352 645689144 3770076778 331335845 4239...</td>\n",
       "      <td>1788073374 733302094 1830571649 676508092 7081...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1186208412</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3320380166 3810793697</td>\n",
       "      <td>1379121209 440668682</td>\n",
       "      <td>1728988561 2950720854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2621578336</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>855842686</td>\n",
       "      <td>2406118796 3550897984 294255260 1125817077 109...</td>\n",
       "      <td>2671721559 1761448345 2356975806 2666669465 10...</td>\n",
       "      <td>1518670705 880919237 2326414227 2673818347 332...</td>\n",
       "      <td>3500235232</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        event                                                yes  \\\n",
       "0  1159822043  1975964455 252302513 4226086795 3805886383 142...   \n",
       "1   686467261  2394228942 2686116898 1056558062 3792942231 41...   \n",
       "2  1186208412                                                NaN   \n",
       "3  2621578336                                                NaN   \n",
       "4   855842686  2406118796 3550897984 294255260 1125817077 109...   \n",
       "\n",
       "                                               maybe  \\\n",
       "0  2733420590 517546982 1350834692 532087573 5831...   \n",
       "1  1498184352 645689144 3770076778 331335845 4239...   \n",
       "2                              3320380166 3810793697   \n",
       "3                                                NaN   \n",
       "4  2671721559 1761448345 2356975806 2666669465 10...   \n",
       "\n",
       "                                             invited                     no  \n",
       "0  1723091036 3795873583 4109144917 3560622906 31...  3575574655 1077296663  \n",
       "1  1788073374 733302094 1830571649 676508092 7081...                    NaN  \n",
       "2                               1379121209 440668682  1728988561 2950720854  \n",
       "3                                                NaN                    NaN  \n",
       "4  1518670705 880919237 2326414227 2673818347 332...             3500235232  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " \"\"\"\n",
    "event_attendees.csv文件：共5维特征\n",
    "event_id：活动ID\n",
    "yes, maybe, invited, and no：以空格隔开的用户列表，\n",
    "分别表示该活动参加的用户、可能参加的用户，被邀请的用户和不参加的用户.\n",
    " \"\"\"\n",
    "\n",
    "#读取数据\n",
    "event_attendees = pd.read_csv(\"event_attendees.csv\")\n",
    "event_attendees.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>friends</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3197468391</td>\n",
       "      <td>1346449342 3873244116 4226080662 1222907620 54...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3537982273</td>\n",
       "      <td>1491560444 395798035 2036380346 899375619 3534...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>823183725</td>\n",
       "      <td>1484954627 1950387873 1652977611 4185960823 42...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1872223848</td>\n",
       "      <td>83361640 723814682 557944478 1724049724 253059...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3429017717</td>\n",
       "      <td>4253303705 2130310957 1838389374 3928735761 71...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         user                                            friends\n",
       "0  3197468391  1346449342 3873244116 4226080662 1222907620 54...\n",
       "1  3537982273  1491560444 395798035 2036380346 899375619 3534...\n",
       "2   823183725  1484954627 1950387873 1652977611 4185960823 42...\n",
       "3  1872223848  83361640 723814682 557944478 1724049724 253059...\n",
       "4  3429017717  4253303705 2130310957 1838389374 3928735761 71..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " \"\"\"\n",
    "user_friends.csv文件：共2维特征\n",
    "user：用户ID\n",
    "friends：以空格隔开的用户好友ID列表，\n",
    " \"\"\"\n",
    "\n",
    "#读取数据\n",
    "user_friends = pd.read_csv(\"user_friends.csv\")\n",
    "user_friends.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第二部分：提取只在train和test中出现的event"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# 统计训练集中有多少不同的用户的events\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "#倒排表\n",
    "#统计每个用户参加的活动   / 每个活动参加的用户\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "    \n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    f = open(filename, 'r')\n",
    "    \n",
    "    #忽略第一行（列名字）\n",
    "    f.readline().strip().split(\",\")\n",
    "\n",
    "    \n",
    "    for line in f:    #对每条记录\n",
    "        cols = line.strip().split(\",\")\n",
    "        uniqueUsers.add(cols[0])   #第一列为用户ID\n",
    "        uniqueEvents.add(cols[1])   #第二列为活动ID\n",
    "        \n",
    "        #eventsForUser[cols[0]].add(cols[1])    #该用户参加了这个活动\n",
    "        #usersForEvent[cols[1]].add(cols[0])    #该活动被用户参加\n",
    "    f.close()\n",
    "\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'1062024228',\n",
       " '4140301246',\n",
       " '695149490',\n",
       " '693605333',\n",
       " '2867836267',\n",
       " '3049491942',\n",
       " '648394846',\n",
       " '1487659279',\n",
       " '2437956447',\n",
       " '3286516126',\n",
       " '1492911169',\n",
       " '2897294049',\n",
       " '1509619943',\n",
       " '2076836172',\n",
       " '4015512775',\n",
       " '3491268592',\n",
       " '1086560243',\n",
       " '3783327786',\n",
       " '2311542641',\n",
       " '2701433747',\n",
       " '3515520150',\n",
       " '1096627389',\n",
       " '97672504',\n",
       " '1210091631',\n",
       " '2129718710',\n",
       " '2324728487',\n",
       " '858707451',\n",
       " '3553950281',\n",
       " '666781614',\n",
       " '4286544397',\n",
       " '172405986',\n",
       " '3508849684',\n",
       " '205658920',\n",
       " '2306621524',\n",
       " '3633741296',\n",
       " '332179237',\n",
       " '1681914210',\n",
       " '3978034285',\n",
       " '2144300585',\n",
       " '106517927',\n",
       " '2023131804',\n",
       " '170930739',\n",
       " '1193557066',\n",
       " '4131245770',\n",
       " '2953099360',\n",
       " '2925251320',\n",
       " '1141817122',\n",
       " '2247712885',\n",
       " '3275948662',\n",
       " '507118096',\n",
       " '3905269676',\n",
       " '1528500344',\n",
       " '3432310055',\n",
       " '1752177173',\n",
       " '177129913',\n",
       " '3578863998',\n",
       " '3204104078',\n",
       " '3767960965',\n",
       " '1401073045',\n",
       " '1773363754',\n",
       " '3121696879',\n",
       " '1893629654',\n",
       " '1183309937',\n",
       " '3397017880',\n",
       " '1345381188',\n",
       " '2245953972',\n",
       " '1638625563',\n",
       " '62489876',\n",
       " '25570068',\n",
       " '2424242179',\n",
       " '923910543',\n",
       " '2982417413',\n",
       " '3962985904',\n",
       " '533904091',\n",
       " '559618821',\n",
       " '2291985041',\n",
       " '2716492883',\n",
       " '1672262744',\n",
       " '1908283633',\n",
       " '3403115736',\n",
       " '518815654',\n",
       " '3916858862',\n",
       " '263011441',\n",
       " '3716036142',\n",
       " '4133786107',\n",
       " '3287030905',\n",
       " '675216211',\n",
       " '2798291665',\n",
       " '3273774165',\n",
       " '2433903660',\n",
       " '1087635781',\n",
       " '3732169617',\n",
       " '2301827443',\n",
       " '3413075630',\n",
       " '1439099168',\n",
       " '3659977472',\n",
       " '1666413374',\n",
       " '3747717446',\n",
       " '1441829270',\n",
       " '3222393526',\n",
       " '613582690',\n",
       " '326944617',\n",
       " '803377467',\n",
       " '3561137432',\n",
       " '2132445355',\n",
       " '1617947886',\n",
       " '921689165',\n",
       " '2107081189',\n",
       " '2401798322',\n",
       " '964696705',\n",
       " '3264734489',\n",
       " '417123290',\n",
       " '28242790',\n",
       " '2415054310',\n",
       " '3849412776',\n",
       " '662043969',\n",
       " '3253907448',\n",
       " '2056684630',\n",
       " '2239025769',\n",
       " '700247942',\n",
       " '1327015206',\n",
       " '1558280207',\n",
       " '1718800254',\n",
       " '767782490',\n",
       " '1644685632',\n",
       " '4063332351',\n",
       " '89901423',\n",
       " '2237504401',\n",
       " '695533465',\n",
       " '2949402180',\n",
       " '2304610871',\n",
       " '299139894',\n",
       " '3626560268',\n",
       " '1048603511',\n",
       " '746576079',\n",
       " '3251235057',\n",
       " '4059184428',\n",
       " '859536802',\n",
       " '931943556',\n",
       " '3349548561',\n",
       " '1192048526',\n",
       " '3487526512',\n",
       " '4270139223',\n",
       " '1783630773',\n",
       " '2038842201',\n",
       " '795973226',\n",
       " '2289214808',\n",
       " '73512801',\n",
       " '4262188500',\n",
       " '3354141479',\n",
       " '67648066',\n",
       " '110700006',\n",
       " '1684415115',\n",
       " '1497910534',\n",
       " '2997290586',\n",
       " '3317835950',\n",
       " '3809972731',\n",
       " '3948784617',\n",
       " '2196514887',\n",
       " '493625429',\n",
       " '3188133088',\n",
       " '3958632725',\n",
       " '2062854199',\n",
       " '414842182',\n",
       " '3248049500',\n",
       " '737340986',\n",
       " '2908519477',\n",
       " '4211398584',\n",
       " '4040837426',\n",
       " '1054917549',\n",
       " '4293596113',\n",
       " '3048578499',\n",
       " '3171268449',\n",
       " '3879292193',\n",
       " '2179936892',\n",
       " '620028937',\n",
       " '1943679456',\n",
       " '2530774373',\n",
       " '4034103347',\n",
       " '1443810444',\n",
       " '4241740470',\n",
       " '2441556923',\n",
       " '3319506135',\n",
       " '1559258260',\n",
       " '2130393831',\n",
       " '4199792713',\n",
       " '4166292492',\n",
       " '3212079634',\n",
       " '2416826549',\n",
       " '453336060',\n",
       " '2428635671',\n",
       " '3395589700',\n",
       " '3864600939',\n",
       " '4201530293',\n",
       " '4202270768',\n",
       " '1616502786',\n",
       " '1454534917',\n",
       " '3388135408',\n",
       " '1692359541',\n",
       " '3867828546',\n",
       " '3826292515',\n",
       " '2403467034',\n",
       " '1478537984',\n",
       " '3823518713',\n",
       " '1500592227',\n",
       " '2457973567',\n",
       " '4058829330',\n",
       " '1497739820',\n",
       " '2497077318',\n",
       " '2969035497',\n",
       " '1089632536',\n",
       " '2123370960',\n",
       " '3347318178',\n",
       " '448676621',\n",
       " '2375103731',\n",
       " '1277292991',\n",
       " '816384882',\n",
       " '3269751206',\n",
       " '2747347526',\n",
       " '1023138400',\n",
       " '805562777',\n",
       " '979018642',\n",
       " '3966531125',\n",
       " '4218422552',\n",
       " '392166594',\n",
       " '3005193781',\n",
       " '4084837460',\n",
       " '2944816123',\n",
       " '366912974',\n",
       " '2165957355',\n",
       " '1425625542',\n",
       " '2409076523',\n",
       " '480972362',\n",
       " '3485195034',\n",
       " '2517281835',\n",
       " '4273898950',\n",
       " '4154074057',\n",
       " '1820083761',\n",
       " '858587465',\n",
       " '1841631859',\n",
       " '254691415',\n",
       " '3080365089',\n",
       " '1070962073',\n",
       " '1857156364',\n",
       " '1257844994',\n",
       " '3400896624',\n",
       " '840735277',\n",
       " '2468957580',\n",
       " '3577652785',\n",
       " '3538261110',\n",
       " '491118104',\n",
       " '2473502708',\n",
       " '2247398820',\n",
       " '3949033351',\n",
       " '3401681420',\n",
       " '1494587743',\n",
       " '2196568626',\n",
       " '1103896952',\n",
       " '3882708104',\n",
       " '4250784104',\n",
       " '2585678579',\n",
       " '1673873496',\n",
       " '708004717',\n",
       " '1886452892',\n",
       " '68377012',\n",
       " '345576099',\n",
       " '3713495962',\n",
       " '2912638473',\n",
       " '186373638',\n",
       " '1746528134',\n",
       " '1152772826',\n",
       " '3899715231',\n",
       " '2978312627',\n",
       " '1057214819',\n",
       " '3787413647',\n",
       " '2072616125',\n",
       " '3659345285',\n",
       " '2735190761',\n",
       " '72149297',\n",
       " '2085933053',\n",
       " '858379964',\n",
       " '3327757331',\n",
       " '1005830738',\n",
       " '1588485860',\n",
       " '2082195127',\n",
       " '1139465813',\n",
       " '3928351581',\n",
       " '1426337764',\n",
       " '2020941968',\n",
       " '303459881',\n",
       " '1521445358',\n",
       " '4105510328',\n",
       " '2066735480',\n",
       " '1338116650',\n",
       " '4289284578',\n",
       " '2184947879',\n",
       " '187728438',\n",
       " '2057068023',\n",
       " '4201550847',\n",
       " '463936882',\n",
       " '2241349126',\n",
       " '2480319760',\n",
       " '3930422129',\n",
       " '736850175',\n",
       " '2293975486',\n",
       " '4013950788',\n",
       " '2704757206',\n",
       " '2828291365',\n",
       " '2774741890',\n",
       " '1347934681',\n",
       " '1307632801',\n",
       " '4290944875',\n",
       " '72461116',\n",
       " '3996963187',\n",
       " '697674217',\n",
       " '4142448875',\n",
       " '1179672480',\n",
       " '2811628790',\n",
       " '4249979866',\n",
       " '920467258',\n",
       " '217706704',\n",
       " '3005548418',\n",
       " '2322818360',\n",
       " '775641004',\n",
       " '1586611723',\n",
       " '2977769484',\n",
       " '878944665',\n",
       " '2414358105',\n",
       " '2969829668',\n",
       " '2716503088',\n",
       " '1867946088',\n",
       " '1093844070',\n",
       " '811590455',\n",
       " '2771523560',\n",
       " '2124576111',\n",
       " '2280892735',\n",
       " '1380051674',\n",
       " '3732079142',\n",
       " '3770815990',\n",
       " '2404898033',\n",
       " '1549252317',\n",
       " '168005957',\n",
       " '1549366784',\n",
       " '1247327810',\n",
       " '82290709',\n",
       " '379197377',\n",
       " '3653941806',\n",
       " '3478474611',\n",
       " '1878997696',\n",
       " '1827283730',\n",
       " '1462150899',\n",
       " '945248517',\n",
       " '959843439',\n",
       " '407288007',\n",
       " '2491808242',\n",
       " '2690912457',\n",
       " '1294070735',\n",
       " '3684638133',\n",
       " '1818524861',\n",
       " '760916752',\n",
       " '1697930615',\n",
       " '172445691',\n",
       " '4142633382',\n",
       " '2360728862',\n",
       " '1641900461',\n",
       " '1776393554',\n",
       " '124989980',\n",
       " '2285963249',\n",
       " '1272938012',\n",
       " '3969985283',\n",
       " '2915662989',\n",
       " '1999509621',\n",
       " '4014449816',\n",
       " '771676713',\n",
       " '1545194399',\n",
       " '998267429',\n",
       " '3708234991',\n",
       " '3428512583',\n",
       " '2170086551',\n",
       " '1928727087',\n",
       " '2432115736',\n",
       " '877945383',\n",
       " '4244463632',\n",
       " '3986920873',\n",
       " '1491066879',\n",
       " '1879190850',\n",
       " '3867936283',\n",
       " '2201282509',\n",
       " '1359556127',\n",
       " '2129137851',\n",
       " '3777549242',\n",
       " '240233220',\n",
       " '1329952040',\n",
       " '1461387354',\n",
       " '3851466541',\n",
       " '1306338110',\n",
       " '1701670121',\n",
       " '3075503747',\n",
       " '3842362115',\n",
       " '3948162966',\n",
       " '2584113432',\n",
       " '622120837',\n",
       " '1023462832',\n",
       " '2905482756',\n",
       " '447655182',\n",
       " '567527059',\n",
       " '1200696517',\n",
       " '3711929027',\n",
       " '2678142390',\n",
       " '704244113',\n",
       " '1977909542',\n",
       " '41623329',\n",
       " '2788751731',\n",
       " '3483869180',\n",
       " '3902750267',\n",
       " '3293166549',\n",
       " '1641436512',\n",
       " '2374444123',\n",
       " '230148747',\n",
       " '3875424137',\n",
       " '739733335',\n",
       " '2849940433',\n",
       " '2798432918',\n",
       " '1819771813',\n",
       " '4011082942',\n",
       " '4205761284',\n",
       " '3188394122',\n",
       " '2113532094',\n",
       " '1675237338',\n",
       " '2211704221',\n",
       " '4053956936',\n",
       " '3083830288',\n",
       " '858488892',\n",
       " '55793348',\n",
       " '693206007',\n",
       " '1448107006',\n",
       " '1438413864',\n",
       " '2692153353',\n",
       " '627338344',\n",
       " '3418426127',\n",
       " '447529362',\n",
       " '3853447857',\n",
       " '4265122277',\n",
       " '3538439402',\n",
       " '1403176651',\n",
       " '3696762643',\n",
       " '789146264',\n",
       " '3453997003',\n",
       " '3175590309',\n",
       " '2234892318',\n",
       " '1299206322',\n",
       " '1309574084',\n",
       " '526306322',\n",
       " '2808951120',\n",
       " '65743991',\n",
       " '1845469953',\n",
       " '3269343017',\n",
       " '999643248',\n",
       " '3667618471',\n",
       " '1568276339',\n",
       " '3223228633',\n",
       " '1749018134',\n",
       " '905634841',\n",
       " '1922434523',\n",
       " '224435617',\n",
       " '3410736122',\n",
       " '3944752179',\n",
       " '2756918554',\n",
       " '1564670110',\n",
       " '647011225',\n",
       " '2338670339',\n",
       " '3192244285',\n",
       " '1811836231',\n",
       " '2238892602',\n",
       " '760705045',\n",
       " '730719446',\n",
       " '3221372597',\n",
       " '1211578449',\n",
       " '1696929787',\n",
       " '1487823750',\n",
       " '3398901860',\n",
       " '428097180',\n",
       " '2553047797',\n",
       " '154434302',\n",
       " '188428993',\n",
       " '2678255240',\n",
       " '4010465283',\n",
       " '3417522749',\n",
       " '1009755933',\n",
       " '4071751701',\n",
       " '804412705',\n",
       " '3384785057',\n",
       " '4238643515',\n",
       " '3560570011',\n",
       " '2191615827',\n",
       " '491131434',\n",
       " '3315807711',\n",
       " '3508551200',\n",
       " '2567519500',\n",
       " '971510596',\n",
       " '2492433817',\n",
       " '3110363670',\n",
       " '4186712466',\n",
       " '4099353109',\n",
       " '892071473',\n",
       " '8816449',\n",
       " '3446902030',\n",
       " '678132679',\n",
       " '2082259770',\n",
       " '3368128234',\n",
       " '1090410712',\n",
       " '3598552560',\n",
       " '1233165674',\n",
       " '512403156',\n",
       " '1490557433',\n",
       " '3451292933',\n",
       " '2974411380',\n",
       " '3781961564',\n",
       " '2093319113',\n",
       " '2329324806',\n",
       " '2360970554',\n",
       " '875838968',\n",
       " '1241270051',\n",
       " '2708243586',\n",
       " '3968521389',\n",
       " '86549210',\n",
       " '1783442779',\n",
       " '1260659849',\n",
       " '1075717146',\n",
       " '1916422039',\n",
       " '2766764102',\n",
       " '893326926',\n",
       " '1522480247',\n",
       " '1565715575',\n",
       " '1431416915',\n",
       " '2215474035',\n",
       " '1214819792',\n",
       " '1817785599',\n",
       " '744858144',\n",
       " '14945235',\n",
       " '3921014619',\n",
       " '1620415785',\n",
       " '3420605744',\n",
       " '2456385399',\n",
       " '1636467034',\n",
       " '739161736',\n",
       " '1497621048',\n",
       " '2853884485',\n",
       " '1506378274',\n",
       " '3216168543',\n",
       " '1222992905',\n",
       " '2148387768',\n",
       " '2020022404',\n",
       " '2584972286',\n",
       " '37290993',\n",
       " '2613114347',\n",
       " '2891782769',\n",
       " '3007412705',\n",
       " '4000101630',\n",
       " '551895988',\n",
       " '4038388081',\n",
       " '2776566965',\n",
       " '927695082',\n",
       " '2695404171',\n",
       " '1601281862',\n",
       " '3480624055',\n",
       " '1631274105',\n",
       " '706200405',\n",
       " '888120649',\n",
       " '1389928885',\n",
       " '3810748556',\n",
       " '1582367689',\n",
       " '2796416513',\n",
       " '3572977792',\n",
       " '2392886869',\n",
       " '600512901',\n",
       " '3559606266',\n",
       " '1581474492',\n",
       " '1837633909',\n",
       " '1707209495',\n",
       " '3171398290',\n",
       " '2955014962',\n",
       " '849041503',\n",
       " '1901024297',\n",
       " '1690154685',\n",
       " '987194908',\n",
       " '3162919464',\n",
       " '2986213038',\n",
       " '2219919748',\n",
       " '3487719716',\n",
       " '2002146936',\n",
       " '799356822',\n",
       " '2251550828',\n",
       " '4194142958',\n",
       " '3422533807',\n",
       " '529900443',\n",
       " '3653306306',\n",
       " '2590444754',\n",
       " '1236148406',\n",
       " '772228233',\n",
       " '2706390147',\n",
       " '2679521343',\n",
       " '1275435478',\n",
       " '1374330612',\n",
       " '2500726913',\n",
       " '3329773282',\n",
       " '1159701801',\n",
       " '1463617028',\n",
       " '1525541504',\n",
       " '879365424',\n",
       " '3969026856',\n",
       " '292032212',\n",
       " '1633364082',\n",
       " '1379372643',\n",
       " '1985340464',\n",
       " '3942861851',\n",
       " '856365940',\n",
       " '573293150',\n",
       " '1730523109',\n",
       " '3697638490',\n",
       " '1281108322',\n",
       " '337389135',\n",
       " '2131379889',\n",
       " '301084021',\n",
       " '181219102',\n",
       " '2380117374',\n",
       " '450093935',\n",
       " '1299853275',\n",
       " '2809607334',\n",
       " '3344451900',\n",
       " '3113793753',\n",
       " '3388271263',\n",
       " '1369208954',\n",
       " '2024105424',\n",
       " '2263960915',\n",
       " '2662849125',\n",
       " '749859087',\n",
       " '191661856',\n",
       " '1405185826',\n",
       " '4097301015',\n",
       " '4280025323',\n",
       " '3010455159',\n",
       " '2163400929',\n",
       " '4059679210',\n",
       " '58273774',\n",
       " '2039242702',\n",
       " '1421210499',\n",
       " '3989198693',\n",
       " '11608112',\n",
       " '2642707561',\n",
       " '434543874',\n",
       " '2922265580',\n",
       " '1221193633',\n",
       " '2859386475',\n",
       " '473531767',\n",
       " '4177290981',\n",
       " '2888757174',\n",
       " '1445028465',\n",
       " '3755202160',\n",
       " '1510219719',\n",
       " '4026815644',\n",
       " '1499053881',\n",
       " '174639232',\n",
       " '3463560786',\n",
       " '703370904',\n",
       " '3771772284',\n",
       " '88377811',\n",
       " '416081328',\n",
       " '3360357269',\n",
       " '2266281591',\n",
       " '3128414020',\n",
       " '590998091',\n",
       " '1672345363',\n",
       " '2477995738',\n",
       " '1639958590',\n",
       " '1669604733',\n",
       " '3114306284',\n",
       " '4289340738',\n",
       " '571271773',\n",
       " '1110856401',\n",
       " '1587441682',\n",
       " '3243313717',\n",
       " '236692540',\n",
       " '2925844380',\n",
       " '3297564047',\n",
       " '2438434714',\n",
       " '3284103018',\n",
       " '3492796375',\n",
       " '778468524',\n",
       " '2063466570',\n",
       " '2695898547',\n",
       " '4177226936',\n",
       " '534708903',\n",
       " '4030786582',\n",
       " '9032213',\n",
       " '2444038166',\n",
       " '1630489177',\n",
       " '3964060639',\n",
       " '1406619711',\n",
       " '3091794905',\n",
       " '3871210244',\n",
       " '2920293801',\n",
       " '218663179',\n",
       " '3874488615',\n",
       " '3457399351',\n",
       " '180901205',\n",
       " '2434392105',\n",
       " '1991938600',\n",
       " '944759264',\n",
       " '3561051213',\n",
       " '2411303936',\n",
       " '2821117540',\n",
       " '1134192019',\n",
       " '263279656',\n",
       " '231088647',\n",
       " '3034550941',\n",
       " '2886354319',\n",
       " '2089971323',\n",
       " '619689337',\n",
       " '3517746814',\n",
       " '2054614790',\n",
       " '2203480307',\n",
       " '2612400653',\n",
       " '4156355007',\n",
       " '1113038458',\n",
       " '2430713742',\n",
       " '1764172149',\n",
       " '3477077952',\n",
       " '494878884',\n",
       " '2951156726',\n",
       " '140337530',\n",
       " '634508293',\n",
       " '851749287',\n",
       " '626269042',\n",
       " '3438779574',\n",
       " '3708625220',\n",
       " '380501274',\n",
       " '919381911',\n",
       " '3254529235',\n",
       " '1338321138',\n",
       " '2633637786',\n",
       " '3610583418',\n",
       " '2724103445',\n",
       " '405428132',\n",
       " '1711604318',\n",
       " '3380687062',\n",
       " '682423835',\n",
       " '1256273578',\n",
       " '2621889209',\n",
       " '3680303526',\n",
       " '1604093765',\n",
       " '3779036237',\n",
       " '410386599',\n",
       " '1432813792',\n",
       " '1101073006',\n",
       " '411244492',\n",
       " '1387920247',\n",
       " '2589917271',\n",
       " '3858844131',\n",
       " '4176588019',\n",
       " '1598382429',\n",
       " '3328077853',\n",
       " '3028859957',\n",
       " '1089819483',\n",
       " '436592000',\n",
       " '808122386',\n",
       " '273094339',\n",
       " '3541811987',\n",
       " '3470999302',\n",
       " '2322243932',\n",
       " '2146032671',\n",
       " '95076039',\n",
       " '1901601017',\n",
       " '2012506795',\n",
       " '3418302627',\n",
       " '351741784',\n",
       " '3243169434',\n",
       " '464565286',\n",
       " '1466293065',\n",
       " '510790020',\n",
       " '3764710330',\n",
       " '3350877007',\n",
       " '3083345340',\n",
       " '1634994103',\n",
       " '1426073917',\n",
       " '4188214514',\n",
       " '3141008702',\n",
       " '115584728',\n",
       " '2613386532',\n",
       " '3668560930',\n",
       " '1085259815',\n",
       " '2648833412',\n",
       " '3493348543',\n",
       " '1188805929',\n",
       " '940916476',\n",
       " '1965000814',\n",
       " '1587116147',\n",
       " '3397593388',\n",
       " '2998423233',\n",
       " '1615291811',\n",
       " '2064772706',\n",
       " '3712070556',\n",
       " '2987292356',\n",
       " '1120182267',\n",
       " '1892440073',\n",
       " '1460863346',\n",
       " '105698575',\n",
       " '1508535164',\n",
       " '2824762134',\n",
       " '264729502',\n",
       " '1784460112',\n",
       " '826943401',\n",
       " '2865586777',\n",
       " '1075725585',\n",
       " '178719942',\n",
       " '3253541195',\n",
       " '2628680570',\n",
       " '3007406784',\n",
       " '2769570008',\n",
       " '3177059558',\n",
       " '1486542415',\n",
       " '406602676',\n",
       " '268669105',\n",
       " '3358028399',\n",
       " '970700766',\n",
       " '155266309',\n",
       " '3036660288',\n",
       " '3563635172',\n",
       " '1103665416',\n",
       " '2967210072',\n",
       " '1587393206',\n",
       " '1077364719',\n",
       " '3656009265',\n",
       " '4068367333',\n",
       " '425366021',\n",
       " '3551837862',\n",
       " '3080864770',\n",
       " '35091389',\n",
       " '2540229116',\n",
       " '2592239355',\n",
       " '1561139627',\n",
       " '3697359735',\n",
       " '4112286860',\n",
       " '2808933304',\n",
       " '1749744109',\n",
       " '1218174058',\n",
       " '1619676916',\n",
       " '4055349094',\n",
       " '450183377',\n",
       " '1106103765',\n",
       " '1602727924',\n",
       " '1379777527',\n",
       " '4041957671',\n",
       " '4103800848',\n",
       " '669142889',\n",
       " '735155031',\n",
       " '521858421',\n",
       " '4091610635',\n",
       " '1172667758',\n",
       " '524691289',\n",
       " '811345080',\n",
       " '3338163812',\n",
       " '2219392559',\n",
       " '1683297114',\n",
       " '224483924',\n",
       " '3034266247',\n",
       " '719385321',\n",
       " '1810485930',\n",
       " '3703220574',\n",
       " '156494494',\n",
       " '3982673935',\n",
       " '746497547',\n",
       " '3715414633',\n",
       " '2849228266',\n",
       " '183271526',\n",
       " '1655934860',\n",
       " '2374384994',\n",
       " '2851440006',\n",
       " '2396566946',\n",
       " '1542105098',\n",
       " '321116395',\n",
       " '726631277',\n",
       " '3216854901',\n",
       " '2977586904',\n",
       " '2742780288',\n",
       " '3174369333',\n",
       " '1532429458',\n",
       " '1573329057',\n",
       " '3324533213',\n",
       " '4178800987',\n",
       " '2107071578',\n",
       " '2263436179',\n",
       " '2051111588',\n",
       " '2994891113',\n",
       " '407638004',\n",
       " '3106536799',\n",
       " '3735952238',\n",
       " '2649802535',\n",
       " '2506160720',\n",
       " '977021682',\n",
       " '3021201850',\n",
       " '2323745666',\n",
       " '65978976',\n",
       " '949540452',\n",
       " '3321054057',\n",
       " '2942809579',\n",
       " '1188335100',\n",
       " '2740101266',\n",
       " '4261753049',\n",
       " '729305487',\n",
       " '1767325091',\n",
       " '810618772',\n",
       " '4153312619',\n",
       " '3251813967',\n",
       " '1155431373',\n",
       " '3524314596',\n",
       " '1314114792',\n",
       " '3573523098',\n",
       " '1287148023',\n",
       " '2825765070',\n",
       " '711611185',\n",
       " '2693141741',\n",
       " '1610171038',\n",
       " '2120604435',\n",
       " '1268088620',\n",
       " '2252007144',\n",
       " '3650881117',\n",
       " '736236994',\n",
       " '446911604',\n",
       " '3875231517',\n",
       " '324581869',\n",
       " '2995369168',\n",
       " '1073807149',\n",
       " '4102628773',\n",
       " '1407157250',\n",
       " '855637924',\n",
       " '2324717272',\n",
       " '108743814',\n",
       " '1972106630',\n",
       " '1255629030',\n",
       " '3499523914',\n",
       " '3200110310',\n",
       " '1975529948',\n",
       " '2708587571',\n",
       " '4083904591',\n",
       " '1611732042',\n",
       " '1512723504',\n",
       " '1693024677',\n",
       " '1326595643',\n",
       " '1728535880',\n",
       " '2780066048',\n",
       " '1615385128',\n",
       " '101780618',\n",
       " '1481015257',\n",
       " '4273848145',\n",
       " '3385455424',\n",
       " '468069570',\n",
       " '427450245',\n",
       " '3138246349',\n",
       " '15160790',\n",
       " '1134231092',\n",
       " '162437481',\n",
       " '3411275264',\n",
       " '2898528659',\n",
       " '1744265468',\n",
       " '919376574',\n",
       " '3202299499',\n",
       " '3769818139',\n",
       " '1458906695',\n",
       " '1269832600',\n",
       " '1281252091',\n",
       " '436651588',\n",
       " '1103713679',\n",
       " '2925964144',\n",
       " '3758162883',\n",
       " '756986165',\n",
       " '3420542143',\n",
       " '580186719',\n",
       " '2168332178',\n",
       " '4005039699',\n",
       " '3427435191',\n",
       " '3245840256',\n",
       " '2383225522',\n",
       " '3231677093',\n",
       " '2219144892',\n",
       " '1811629670',\n",
       " '640279319',\n",
       " '1745350497',\n",
       " '1205391785',\n",
       " '973265170',\n",
       " '4093219541',\n",
       " '2470001152',\n",
       " '1740282438',\n",
       " '40744964',\n",
       " '2309977061',\n",
       " '3424643087',\n",
       " '4106186736',\n",
       " '188505465',\n",
       " '1859112770',\n",
       " '1320937087',\n",
       " ...}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniqueEvents #此部分即为只在测试集和训练集中同时出现的活动"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Turn the set of id strings into a list and convert the ids to integers.\n",
    "# pd.to_numeric returns a new object instead of converting in place, so its\n",
    "# result has to be assigned; otherwise the ids stay strings.\n",
    "uniqueEvents=pd.to_numeric(list(uniqueEvents))\n",
    "# Column-name -> values mapping used to build the event_id DataFrame.\n",
    "c={\"event_id\" : uniqueEvents}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): this is the same class as pd.DataFrame; the import could live\n",
    "# in the import cell at the top of the notebook.\n",
    "from pandas.core.frame import DataFrame\n",
    "# Wrap the mapping in a DataFrame with a single event_id column.\n",
    "uniqueEvents=DataFrame(data=c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save the ids to hh.csv. The row index is written as well (index=True is the\n",
    "# default); it comes back as the 'Unnamed: 0' column when the file is reloaded.\n",
    "uniqueEvents.to_csv('hh.csv',index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reload the ids from hh.csv. The dtypes check in the next cell shows event_id\n",
    "# parsed as int64, plus an 'Unnamed: 0' column holding the saved row index.\n",
    "uniqueEvents=pd.read_csv('hh.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Unnamed: 0    int64\n",
       "event_id      int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the column dtypes after the reload; event_id is presumably needed as an\n",
    "# integer so that it matches events.event_id in the merge below (TODO confirm).\n",
    "uniqueEvents.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Inner-join `events` with the collected ids on event_id, so only the rows of\n",
    "# `events` whose event_id is listed in uniqueEvents are kept (per the original\n",
    "# note, these are the events that occur in train and test).\n",
    "newevent=events.merge(uniqueEvents,how='inner',on='event_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>3738</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>11183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>4613</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>10086</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2146</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 111 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1     ...      c_93  c_94  c_95  c_96  c_97  c_98  c_99  c_100  \\\n",
       "0  NaN  NaN    2     ...         1     0     0     0     0     0     0      0   \n",
       "1  NaN  NaN    2     ...         0     0     0     0     0     0     0      0   \n",
       "2  NaN  NaN    0     ...         0     0     0     0     0     0     0      0   \n",
       "3  NaN  NaN    1     ...         0     0     0     0     0     0     0      0   \n",
       "4  NaN  NaN    1     ...         0     0     0     0     0     0     0      0   \n",
       "\n",
       "   c_other  Unnamed: 0  \n",
       "0        9        3738  \n",
       "1        7       11183  \n",
       "2       12        4613  \n",
       "3        8       10086  \n",
       "4        9        2146  \n",
       "\n",
       "[5 rows x 111 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the merged frame.\n",
    "newevent.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第三部分：进行聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Drop the fields that are not needed: ids, start time, location fields and\n",
    "# the leftover 'Unnamed: 0' index column.\n",
    "# NOTE(review): `train` may shadow an earlier variable of the same name - confirm.\n",
    "train=newevent.drop(\n",
    "    labels=['event_id','user_id','start_time',\n",
    "            'city','state','zip','country','lat','lng',\n",
    "            'Unnamed: 0'],\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>c_7</th>\n",
       "      <th>c_8</th>\n",
       "      <th>c_9</th>\n",
       "      <th>c_10</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  c_10   ...     c_92  c_93  \\\n",
       "0    2    0    2    0    0    0    0    0    0     0   ...        0     1   \n",
       "1    2    0    2    0    0    0    0    0    0     0   ...        0     0   \n",
       "2    0    0    0    0    0    0    0    0    0     0   ...        0     0   \n",
       "3    1    0    2    1    0    0    0    0    0     0   ...        0     0   \n",
       "4    1    1    0    0    0    0    0    2    0     0   ...        0     0   \n",
       "\n",
       "   c_94  c_95  c_96  c_97  c_98  c_99  c_100  c_other  \n",
       "0     0     0     0     0     0     0      0        9  \n",
       "1     0     0     0     0     0     0      0        7  \n",
       "2     0     0     0     0     0     0      0       12  \n",
       "3     0     0     0     0     0     0      0        8  \n",
       "4     0     0     0     0     0     0      0        9  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the shape of train_image: (13418, 101)\n"
     ]
    }
   ],
   "source": [
    "print('the shape of train_image: {}'.format(train.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(13418, 2)\n"
     ]
    }
   ],
   "source": [
    "#对数据进行PCA降维\n",
    "pca = PCA(n_components=0.98)\n",
    "pca.fit(train)\n",
    "\n",
    "train_pca = pca.transform(train)\n",
    "\n",
    "# 降维后的特征维数\n",
    "print(train_pca.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 一个参数点（聚类数据为K）的模型，在校验集上评价聚类算法性能\n",
    "def K_cluster_analysis(K, train):\n",
    "    start = time.time()\n",
    "    \n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(train)\n",
    "    y_train_pred = mb_kmeans.fit_predict(train)\n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(train,mb_kmeans.predict(train))\n",
    "\n",
    "    end = time.time()\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "\n",
    "    \n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.41637785240586095, time elaps:7\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.29000510542485614, time elaps:5\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.16439624073147857, time elaps:5\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.13989752079892892, time elaps:5\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.13167658096910956, time elaps:5\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.11657775243360612, time elaps:6\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.0758079145370435, time elaps:6\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.09718427149532338, time elaps:5\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.08362286391708357, time elaps:5\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.07174336639630058, time elaps:6\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [10, 20, 30,40,50,60,70,80,80,100]\n",
    "CH_scores = []\n",
    "\n",
    "for K in Ks:\n",
    "    ch= K_cluster_analysis(K,train)\n",
    "    CH_scores.append(ch)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAHzVJREFUeJzt3XmUVPWZ//H3w24ICkqzY0DFhQhC\nrOCSgBl+RuGooHPcMBqMRuKCqKDSyZg4wcmIQBSNxMCoaDBA0KhBE2Uc0CTGI9IMSGQbGlxoIdJx\nF1G25/fH93a6aBu6mq6uW1X38zqnDn1v3Vv9dJ3ic2/d5fmauyMiIsnQJO4CREQkdxT6IiIJotAX\nEUkQhb6ISIIo9EVEEkShLyKSIAp9EZEEUeiLiCSIQl9EJEGaxV1ATe3bt/cePXrEXYaISEFZunTp\nP9y9pK7l8i70e/ToQVlZWdxliIgUFDN7M5PldHhHRCRBFPoiIgmi0BcRSRCFvohIgij0RUQSRKEv\nIpIgCn0RkQQpmtD/4AP4yU9gzZq4KxERyV9FE/o7dsCUKTBpUtyViIjkr6IJ/ZISuOwyeOQRqKiI\nuxoRkfxUNKEPMG4c7N4NU6fGXYmISH4qqtDv2RMuuACmT4f334+7GhGR/FNUoQ8wfjx88gn88pdx\nVyIikn+KLvT79oWhQ+Huu2HbtrirERHJLxmFvpkNMbO1ZlZuZqX7WO5cM3MzS6XN+2G03lozOz0b\nRdeltBQqK2HmzFz8NhGRwlFn6JtZU2AaMBToDYwws961LNcGGAMsTpvXG7gQ+CowBPhl9HqNauBA\nOPFEmDwZdu5s7N8mIlI4MtnTHwCUu/sGd98OzAWG17LcbcAk4LO0ecOBue7+ubu/DpRHr9eozMKx\n/TfegEcfbezfJiJSODIJ/a7AxrTpimjeP5lZf6C7uz9d33Uby7BhcPTRcMcd4J6L3ygikv8yCX2r\nZd4/Y9TMmgB3AePqu27aa4wyszIzK6usrMygpLo1aQI33wyvvgoLFmTlJUVECl4moV8BdE+b7gZs\nSptuAxwLvGBmbwAnAvOjk7l1rQuAu89w95S7p0pK6hzXN2Pf+Q507Rr29kVEJLPQXwL0MrOeZtaC\ncGJ2ftWT7v6hu7d39x7u3gN4GRjm7mXRcheaWUsz6wn0Al7J+l+xFy1awNix8MILsHhxnYuLiBS9\nOkPf3XcCo4EFwGpgnruvNLMJZjasjnVXAvOAVcCzwDXuvqvhZWfuiiugXTvt7YuIAJjn2VnOVCrl\nZWVlWX3NH/8YfvYzWLUqnNwVESk2ZrbU3VN1LVd0d+TWZswYaNUqXLcvIpJkiQj9qrbLs2bB22/H\nXY2ISHwSEfpQ3Xb5rrvirkREJD6JCX21XRYRSVDoQ7hZS22XRSTJEhX6xx2ntssikmyJCn0IjdjU\ndllEkipxoT9oUGi7PGWK2i6LSPIkLvSr2i6//jo89ljc1YiI5FbiQh+q2y5PnKi2yyKSLIkM/fS2\ny//933FXIyKSO4kMfahuuzxxYtyViIjkTmJDX22XRSSJEhv6ENout22rtssikhyJDv02bWD0aHjy\nSVi7Nu5qREQaX6JDH+Daa6FlS7VdFpFkSHzod+gAl18Ov/612i6LSPFLfOhDddvlqVPjrkREpHEp\n9Kluu/yrX6ntsogUN4V+pKrt8n33xV2JiEjjySj0zWyIma01s3IzK63l+SvN7G9mttzMXjSz3tH8\nHma2LZq/3Mx+le0/IFuOOw6GDAmHeNR2WUSKVZ2hb2ZNgWnAUKA3MKIq1NPMdvc+7t4PmATcmfbc\nenfvFz2uzFbhjaG0NLRdfuihuCsREWkcmezpDwDK3X2Du28H5gLD0xdw94/SJlsDBdnGbNAgOOEE\ntV0WkeKVSeh3BTamTVdE8/ZgZteY2X
rCnv6YtKd6mtkyM/uTmQ2s7ReY2SgzKzOzssrKynqUn11m\nYW9/wwa1XRaR4pRJ6Fst876wJ+/u09z9cGA8cEs0ezNwqLv3B8YCs83swFrWneHuKXdPlZSUZF59\nI6hqu3zHHWq7LCLFJ5PQrwC6p013AzbtY/m5wNkA7v65u78b/bwUWA8cuX+l5kZV2+Xly9V2WUSK\nTyahvwToZWY9zawFcCEwP30BM+uVNnkGsC6aXxKdCMbMDgN6ARuyUXhjqmq7rEZsIlJs6gx9d98J\njAYWAKuBee6+0swmmNmwaLHRZrbSzJYTDuOMjOYPAlaY2avAY8CV7v5e1v+KLKtqu/z88/DKK3FX\nIyKSPeZ5duA6lUp5WVlZ3GXw8cdw6KEweDD87ndxVyMism9mttTdU3Utpzty96JNG7jmGnjiCbVd\nFpHiodDfhzFj1HZZRIqLQn8fOnSAyy5T22URKR4K/TrceKPaLotI8VDo16FnTzj/fLVdFpHioNDP\nwPjxarssIsVBoZ+BqrbLd9+ttssiUtgU+hkqLYUtW+Dhh+OuRERk/yn0M1TVdnnyZLVdFpHCpdDP\nkFk4tr9hg+7QFZHCpdCvh+HD4aijYOJEtV0WkcKk0K+H9LbLzz0XdzUiIvWn0K+nqrbLEyfGXYmI\nSP0p9OupZUu44YbQdnnJkrirERGpH4X+fhg1Ctq21SArIlJ4FPr7oart8uOPq+2yiBQWhf5+qmq7\nPGVK3JWIiGROob+f0tsub9rXMPEiInlEod8A48aFu3PVdllECkVGoW9mQ8xsrZmVm1lpLc9faWZ/\nM7PlZvaimfVOe+6H0Xprzez0bBYft8MOgwsuCG2XP/gg7mpEROpWZ+ibWVNgGjAU6A2MSA/1yGx3\n7+Pu/YBJwJ3Rur2BC4GvAkOAX0avVzRuvjkMoq62yyJSCDLZ0x8AlLv7BnffDswFhqcv4O4fpU22\nBqqaFAwH5rr75+7+OlAevV7R6NcvtF2eOlVtl0Uk/2US+l2BjWnTFdG8PZjZNWa2nrCnP6Y+6xa6\n8ePVdllECkMmoW+1zPtCuzF3n+buhwPjgVvqs66ZjTKzMjMrq6yszKCk/HLKKWq7LCKFIZPQrwC6\np013A/Z1keJc4Oz6rOvuM9w95e6pkpKSDErKL2q7LCKFIpPQXwL0MrOeZtaCcGJ2fvoCZtYrbfIM\nYF3083zgQjNraWY9gV7AKw0vO/9UtV2+4w61XRaR/FVn6Lv7TmA0sABYDcxz95VmNsHMhkWLjTaz\nlWa2HBgLjIzWXQnMA1YBzwLXuPuuRvg7YlfVdnnZMrVdFpH8ZZ5nu6WpVMrLysriLmO/fP55uHb/\n6KNh4cK4qxGRJDGzpe6eqms53ZGbRS1bwtixsGiR2i6LSH5S6GdZVdtlDbIiIvlIoZ9lbdrA6NHw\nxBOwZk3c1YiI7Emh3wjGjIFWrcJ1+yIi+USh3whKSuDyy2HWLKioiLsaEZFqCv1GMm4c7N4Nd90V\ndyUiItUU+o2kRw+46CKYPh3eey/uakREAoV+I7r5Zti6Fe69N+5KREQChX4jOvZYOOssuOeeEP4i\nInFT6Dey0lJ491144IG4KxERUeg3upNPhoEDYcoU2LEj7mpEJOkU+jlQWgobN8KcOXFXIiJJp9DP\ngaFDoW/f0HZ59+64qxGRJFPo54BZ2NtftQqeeiruakQkyRT6OXLeedCzJ9x+uwZZEZH4KPRzpFkz\nuOkmWLwY/vznuKsRkaRS6OfQpZdChw5quywi8VHo59ABB8D118Ozz8Ly5XFXIyJJpNDPsauvhgMP\nDFfyiIjkmkI/xw46CK66CubNg/LyuKsRkaTJKPTNbIiZrTWzcjMrreX5sWa2ysxWmNlCM/tK2nO7\nzGx59JifzeIL1XXXQfPm4S5dEZFcqjP0zawpMA0YCvQGRphZ7xqLLQNS7t4XeAyYlPbcNnfvFz2G\nZa
nugta5czipO3MmbN4cdzUikiSZ7OkPAMrdfYO7bwfmAsPTF3D3593902jyZaBbdsssPjfeCDt3\nwt13x12JiCRJJqHfFdiYNl0Rzduby4Fn0qZbmVmZmb1sZmfvR41F6Ygjwg1b990HH34YdzUikhSZ\nhL7VMq/We0rN7GIgBaQPCX6ou6eAi4CpZnZ4LeuNijYMZZWVlRmUVBzGj4ePPgrBLyKSC5mEfgXQ\nPW26G7Cp5kJmdirwb8Awd/+8ar67b4r+3QC8APSvua67z3D3lLunSkpK6vUHFLL+/WHIkDCO7rZt\ncVcjIkmQSegvAXqZWU8zawFcCOxxFY6Z9QemEwJ/S9r8dmbWMvq5PfANYFW2ii8GpaWwZQs89FDc\nlYhIEtQZ+u6+ExgNLABWA/PcfaWZTTCzqqtxJgNfBh6tcWnmMUCZmb0KPA9MdHeFfppBg+DEE2Hy\n5HBiV0SkMZnnWcvHVCrlZWVlcZeRU7//PZx9NsyeDSNGxF2NiBQiM1sanT/dJ92RmwfOOguOOSY0\nYsuzbbCIFBmFfh5o0iRcybNiBTzzTN3Li4jsL4V+nrjoIujeXW2XRaRxKfTzRPPm4S7dv/wF/vrX\nuKsRkWKl0M8jl18Ohxyitssi0ngU+nmkdWsYMyYMnv7aa3FXIyLFSKGfZ665JoT/pEl1LysiUl8K\n/TxzyCEwalS4Zv+NN+KuRkSKjUI/D40dGy7j/PnP465ERIqNQj8PdesGl1wC998f+vKIiGSLQj9P\n3XQTfP45/OIXcVciIsVEoZ+njj4azjkH7r0XPv447mpEpFgo9PPY+PHwwQcwY0bclYhIsVDo57EB\nA2DwYLjzznCoR0SkoRT6ea60FDZtglmz4q5ERIqBQj/PnXoqHH98uFlr1664qxGRQqfQz3NmYW9/\n3Tp44om4qxGRQqfQLwDnnAO9emmQFRFpOIV+AWjaFG6+GZYuhYUL465GRAqZQr9AXHIJdO4Mt98e\ndyUiUsgyCn0zG2Jma82s3MxKa3l+rJmtMrMVZrbQzL6S9txIM1sXPUZms/gkadky9ORZtAheeSXu\nakSkUNUZ+mbWFJgGDAV6AyPMrHeNxZYBKXfvCzwGTIrWPRi4FTgBGADcambtsld+svzgB9C2rQZZ\nEZH9l8me/gCg3N03uPt2YC4wPH0Bd3/e3T+NJl8GukU/nw485+7vufv7wHPAkOyUnjxt2sDo0eEq\nnjVr4q5GRApRJqHfFdiYNl0Rzduby4Fn6rOumY0yszIzK6usrMygpOQaMwZatYLJk+OuREQKUSah\nb7XMq/XCQTO7GEgBVZGU0bruPsPdU+6eKikpyaCk5CopCWPpzpoFFRVxVyMihSaT0K8AuqdNdwM2\n1VzIzE4F/g0Y5u6f12ddqZ9x42D37tCTR0SkPjIJ/SVALzPraWYtgAuB+ekLmFl/YDoh8NOH/VgA\nnGZm7aITuKdF86QBevSAESNC98133427GhEpJHWGvrvvBEYTwno1MM/dV5rZBDMbFi02Gfgy8KiZ\nLTez+dG67wG3ETYcS4AJ0TxpoPHjYetWmDYt7kpEpJCY59l9/alUysvKyuIuoyAMGwYvvQRvvgmt\nW8ddjYjEycyWunuqruV0R24BKy0Nh3ceeCDuSkSkUCj0C9jJJ8PAgTBlCuzYEXc1IlIIFPoFrrQU\nNm6E2bPjrkRECoFCv8ANHQp9+oTWDLt3x12NiOQ7hX6BqxpkZfVqeOqpuKsRkXyn0C8C558PPXuG\ntst5djGWiOQZhX4RaNYMbroJFi+GP/857mpEJJ8p9IvEpZdChw5hSEURkb1R6BeJAw6A66+HZ5+F\nZcvirkZE8pVCv4hcdVXoua9BVkRkbxT6RaRt2xD8jz4K5eVxVyMi+UihX2Suvx6aNw936YqI1KTQ\nLzKdO4eTujNnwubNcVcjIvlGoV+EbrwRdu6Eu++OuxIRyTfN4i5A
su+II+C88+Cee0L4X3UVHH54\n3FWJSD7Qnn6R+vnP4YwzYOrUsBEYOhSefhp27Yq7MhGJk0K/SHXtGq7ieest+Pd/h1dfhbPOCnv8\nEydCZWXcFYpIHBT6Ra5LF7j11jC61qOPwmGHwQ9/CN26wSWXwMsvq1+PSJIo9BOieXM491xYtAhW\nroRRo+D3v4eTToLjj4f774dPP427ShFpbBmFvpkNMbO1ZlZuZqW1PD/IzP7XzHaa2bk1ntsVDZb+\nzwHTJV69e8MvfgFvvw333RdG3briinBI6IYb4P/+L+4KRaSx1Bn6ZtYUmAYMBXoDI8ysd43F3gIu\nBWobv2mbu/eLHsMaWK9kUZs2cOWVsGJF6M45ZAjcey8cdRScdho8+WS4+kdEikcme/oDgHJ33+Du\n24G5wPD0Bdz9DXdfAWjspgJkFsbanTMnDL14221hUJZzzgnnAH72M3jnnbirFJFsyCT0uwIb06Yr\nonmZamVmZWb2spmdXa/qJOc6dYJbboHXX4fHHw97/bfcAt27w4gR8OKLOvErUsgyCX2rZV59/tsf\n6u4p4CJgqpl94TYhMxsVbRjKKnUtYV5o1izs6T/3HKxZA1dfDc88E74R9OsH06fDJ5/EXaWI1Fcm\noV8BdE+b7gZsyvQXuPum6N8NwAtA/1qWmeHuKXdPlZSUZPrSkiNHHRVu8nr7bZgxIxwOuvLKcDno\ntdeGQ0EiUhgyCf0lQC8z62lmLYALgYyuwjGzdmbWMvq5PfANYNX+Fivxat06XOWzbBm89BIMGxY2\nAr17w+DB8LvfhSuBRCR/1Rn67r4TGA0sAFYD89x9pZlNMLNhAGb2dTOrAM4DppvZymj1Y4AyM3sV\neB6Y6O4K/QJnFq7vf+SRcOL3P/8T1q8P9wH06AE//Slsyvi7oIjkknmenZVLpVJeVlYWdxlST7t2\nwR//CNOmwYIF1ecErr4aTjklbChEpPGY2dLo/Om+l1PoS7aVl4ebvmbOhPffD8f+jzwyNH6rehx+\neHi0aRN3tSLFQaEvsfv0U/jtb0Prh/Xrw2PLlj2X6dgxhH/NDcIRR8DBB8dTt0ghUuhLXvroo+oN\nQHl59WP9eqio2HPZdu323CCk/9yxow4ZiaTLNPQ1iIrk1IEHQv/+4VHTtm2wYcOeG4T16+GVV2De\nPNiddr9369Z73yB06wZN1EpQpFYKfckbBxwAX/1qeNS0fXtoD11zg7BqVRgcZvv26mVbtAjtI2pu\nEI47LowhLJJkCn0pCC1aQK9e4VHTrl3hxrGah4vKy8P5hPSW0UcfHe4p+Jd/gW99C9q3z9mfIJIX\ndExfipp7aBa3bl0YMOb550NH0a1bw/N9+4aNwODBMGgQHHRQvPWK7C+dyBXZix07YMmSsAFYtCjc\nXfzZZ+E8wPHHh28BgwfDN78Zzh2IFAKFvkiGPvssfAtYtChsCF5+OYwj0KwZnHBC9eGgk06CVq3i\nrlakdgp9kf20dSv89a/VG4GysnDlUMuWcPLJ1RuBAQPCMJQi+UChL5IlH34YzgNUHQ569dUwv3Xr\ncAio6pxA//7QtGm8tUpyKfRFGsk//gF/+lP1RqCqtfRBB4U+Q1XnBI49VvcLSO4o9EVyZPNmeOGF\n6sNB69eH+e3bh8tCqw4HHXWU7iKWxqPQF4nJW29VfwtYtKi6vUTnziH8x4wJJ4hFsinT0NeXT5Es\nO/RQGDkSHn44bADWrQvDS55ySmg7ffLJ8OMfa8AZiYdCX6QRmYUWEKNGwZw54dDPJZfAf/wHnHhi\naCMhkksKfZEcOuggeOihMLTkm2/C174Wxh9ObyYn0pgU+iIx+Nd/hddeg1NPhRtugG9/OxwKEmls\nCn2RmHTqBE89FQaXX7wY+vSBWbNCvyCRxqLQF4mRGVxxRbjhq08f+O534bzzwr0AIo0ho9A3syFm\nttbMys2stJbnB5nZ/5rZTjM7
t8ZzI81sXfQYma3CRYrJ4YeHG75uvx3mzw8bgD/8Ie6qpBjVGfpm\n1hSYBgwFegMjzKx3jcXeAi4FZtdY92DgVuAEYABwq5m1a3jZIsWnaVMoLQ0jhbVvD2eeCT/4AXzy\nSdyVSTHJZE9/AFDu7hvcfTswFxievoC7v+HuK4Ca1yCcDjzn7u+5+/vAc8CQLNQtUrT69Qutn2+8\nEf7rv8L0Sy/FXZUUi0xCvyuwMW26IpqXiYasK5JYrVrB5MmhvcPOnTBwIPzoR3sOCymyPzIJ/dq6\nhWR6fUFG65rZKDMrM7OyysrKDF9apPgNGgQrVoQ7fG+/PbRveO21uKvKjjVrwvkLya1MQr8C6J42\n3Q3YlOHrZ7Suu89w95S7p0pKSjJ8aZFkOPBAePBBePLJMBZwKgV33lmYN3Rt2QL33ANf/zoccwwM\nHx5uUpPcyST0lwC9zKynmbUALgQy3T4vAE4zs3bRCdzTonkiUk/Dh4e9/NNPh3HjQvfOQgjMbdtg\n7txwYrpLF7juunDIanh0ZlCHrHKrztB3953AaEJYrwbmuftKM5tgZsMAzOzrZlYBnAdMN7OV0brv\nAbcRNhxLgAnRPBHZDx06hD3+Bx+EpUvDpZ0PPZR/N3Tt3h06jV52GXTsCCNGwPLlYWP1t7/BsmXh\nfgTJvWaZLOTufwT+WGPeT9J+XkI4dFPbug8CDzagRhFJYwbf+17o1T9yZPh5/vzQyTPuo6MrV8Ij\nj8BvfgMbN8KXvwznnhuazJ1yikYWywe6I1ekQPXsGfamJ00KN3Ideyw8/XTu6/j73+Guu0LzuGOP\nDVcd9ekDs2fDO+/AzJnhUJQCPz8o9EUKWNOmcNNN4br+Tp3grLNCW4ePP27c37t1a9ibHzIEunaF\nsWPD0JBTp4aTzX/4Qzik86UvNW4dUn8KfZEi0LdvuJN3/Hh44AE47jh48cXs/o5du+B//iccUurU\nCS6+OIwPXFoaxgUoKwsnaTt2zO7vlezK6Ji+iOS/li1h4sRwlcx3vxuu8b/pJpgwITy3v1asCN0/\nZ8+GTZvCJaQXXBCO0w8c2PDB3ydPDpdvdulS/ejcWd8SGotCX6TIfPOboWvn2LHheP+zz4aTq336\nZP4amzaFkJ81K4R+s2YwdGg4fHPmmXDAAQ2vs29fOPLI8Ds+++yLzx900Bc3BOnTVfNatWp4LUmi\ngdFFithTT8H3vw8ffBCGaBw7du8nVD/5BB5/PITwwoXhMtATTgiHcS64oPGuDHIP9W3aBJs3h3/T\nH+nzarumv127fW8cOncOj4Z82ykEmQ6MrtAXKXKVlaFb5xNPhMMxDz8crvyBcJPUwoUh6J94Aj79\nNDx38cXhceSR8daezh3ee6/ujcPmzbUPOn/IIXV/a+jUCVq0yP3flg0KfRH5J3f49a/h2mvDz7fd\nFoZnnDMnXHLZti2cf344Tv+Nb4R7AQrV7t3w7ruZbRx27fri+iUldW8cOnaE5s1z/7fti0JfRL7g\nzTfh0ktD987mzeGMM0LQn3FG8R/+qGn37vAtqK6Nw9///sU+R2bh7uh9bRi6dAnLNMvRmVOFvojU\navfu0J//mGPCIQ/Zt127QqO42jYM6RuHd975YjuMJk3Ct4J9bRi6dAnfLhp681qmoa+rd0QSpkmT\ncIWPZKZp0+qTwV/72t6X27kzBP/evjVUVIR7KbZsqf13dOoUzrnMmdN4fwso9EVEsqJZs3B3ctc6\nhonasSMcMqpt49C5cw7qbPxfISIiVZo3h+7dwyMOasMgIpIgCn0RkQRR6IuIJIhCX0QkQRT6IiIJ\notAXEUkQhb6ISIIo9EVEEiTveu+YWSXwZtx1NFB74B9xF5FH9H7sSe9HNb0Xe2rI+/EVd69z1IO8\nC/1iYGZlmTQ+Sgq9H3vS+1FN78WecvF+6PCOiEiCKPRFRBJEod84ZsRdQJ7R+7EnvR/V9F7sqd
Hf\nDx3TFxFJEO3pi4gkiEK/gcysu5k9b2arzWylmV0XzT/YzJ4zs3XRv+3irjVXzKypmS0zs6ej6Z5m\ntjh6L35rZi3irjFXzKytmT1mZmuiz8hJCf9s3BD9P3nNzOaYWaskfT7M7EEz22Jmr6XNq/XzYME9\nZlZuZivMbB/jdmVOod9wO4Fx7n4McCJwjZn1BkqBhe7eC1gYTSfFdcDqtOk7gLui9+J94PJYqorH\n3cCz7n40cBzhfUnkZ8PMugJjgJS7Hws0BS4kWZ+Ph4AhNebt7fMwFOgVPUYB92WlAnfXI4sP4PfA\nt4G1QOdoXmdgbdy15ejv7xZ9cAcDTwNGuNmkWfT8ScCCuOvM0XtxIPA60bmztPlJ/Wx0BTYCBxNG\n7XsaOD1pnw+gB/BaXZ8HYDoworblGvLQnn4WmVkPoD+wGOjo7psBon87xFdZTk0FbgZ2R9OHAB+4\n+85ouoLwnz8JDgMqgZnR4a77zaw1Cf1suPvbwBTgLWAz8CGwlOR+Pqrs7fNQtZGskpX3RqGfJWb2\nZeB3wPXu/lHc9cTBzM4Etrj70vTZtSyalEvGmgFfA+5z9/7AVhJyKKc20bHq4UBPoAvQmnAIo6ak\nfD7q0ij/dxT6WWBmzQmB/xt3fzya/Y6ZdY6e7wxsiau+HPoGMMzM3gDmEg7xTAXamlmzaJluwKZ4\nysu5CqDC3RdH048RNgJJ/GwAnAq87u6V7r4DeBw4meR+Pqrs7fNQAaQPn56V90ah30BmZsADwGp3\nvzPtqfnAyOjnkYRj/UXN3X/o7t3cvQfhBN0id/8O8DxwbrRYIt4LAHf/O7DRzI6KZv0/YBUJ/GxE\n3gJONLMvRf9vqt6PRH4+0uzt8zAf+G50Fc+JwIdVh4EaQjdnNZCZfRP4C/A3qo9j/4hwXH8ecCjh\nw36eu78XS5ExMLNvATe6+5lmdhhhz/9gYBlwsbt/Hmd9uWJm/YD7gRbABuB7hJ2tRH42zOynwAWE\nq96WAd8nHKdOxOfDzOYA3yJ003wHuBV4klo+D9GG8V7C1T6fAt9z97IG16DQFxFJDh3eERFJEIW+\niEiCKPRFRBJEoS8ikiAKfRGRBFHoi4gkiEJfRCRBFPoiIgny/wFAtVz29h9pcwAAAABJRU5ErkJg\ngg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x257c70d2390>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同PCA维数下模型的性能，找到最佳模型／参数（分数最高）\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 2\n",
      "CH_score: 0.7483003381020914, time elaps:15\n",
      "K-means begin with clusters: 3\n",
      "CH_score: 0.568230363979951, time elaps:6\n",
      "K-means begin with clusters: 4\n",
      "CH_score: 0.5789112930673879, time elaps:5\n",
      "K-means begin with clusters: 5\n",
      "CH_score: 0.4565648707058535, time elaps:6\n",
      "K-means begin with clusters: 6\n",
      "CH_score: 0.4589168079486463, time elaps:6\n",
      "K-means begin with clusters: 7\n",
      "CH_score: 0.4520717113120725, time elaps:6\n",
      "K-means begin with clusters: 8\n",
      "CH_score: 0.4196654772047621, time elaps:6\n",
      "K-means begin with clusters: 9\n",
      "CH_score: 0.46488096638992216, time elaps:5\n"
     ]
    }
   ],
   "source": [
    "#得分最高的不到0.5，继续调优\n",
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [2,3,4,5,6,7,8,9]\n",
    "CH_scores = []\n",
    "\n",
    "for K in Ks:\n",
    "    ch= K_cluster_analysis(K,train)\n",
    "    CH_scores.append(ch)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3XmYFPW97/H3l31zYRk3FgEFZUzc\n0gGXqAkahVwC7gFPosToTCJExSWPiVEietR79ahZjCcIJhwTQQQX1CjhuORwfcQwqKhAUERZgugo\niMaALH7vH7+aSzPMTPfM9HRVd39ez9PPTFdXT30HnU9V/75VvzJ3R0RESkOruAsQEZH8UeiLiJQQ\nhb6ISAlR6IuIlBCFvohICVHoi4iUEIW+iEgJUeiLiJQQhb6ISAlpE3cBtfXo0cP79u0bdxkiIgVl\n0aJFH7p7Wab1Ehf6ffv2paqqKu4yREQKipmtymY9De+IiJQQhb6ISAlR6IuIlBCFvohICckq9M1s\nmJktN7MVZnZNHa/faWavRo83zezjtNd2pL02J5fFi4hI42Q8e8fMWgN3A98E1gILzWyOuy+tWcfd\nJ6St/2PgqLQfsdndj8xdySIi0lTZHOkPBla4+0p33wrMAEY1sP4YYHouihMRkdzKJvR7AmvSnq+N\nlu3GzA4E+gHPpi3uYGZVZrbAzE6v530V0TpV1dXVWZa+qw0b4Be/gDfeaNLbRURKQjahb3Usq+/G\nuqOBWe6+I21ZH3dPAecBd5nZQbv9MPfJ7p5y91RZWcYLyurkDrfeCv/5n016u4hIScgm9NcCvdOe\n9wLW1bPuaGoN7bj7uujrSuB5dh3vz5nu3eHss+H+++Gzz1piCyIihS+b0F8IDDCzfmbWjhDsu52F\nY2aHAF2BF9OWdTWz9tH3PYDjgaW135srlZXwySfw4IMttQURkcKWMfTdfTswHpgLLANmuvsSM5tk\nZiPTVh0DzHD39KGfQUCVmS0GngNuTT/rJ9e+9jUYNAh+97uW2oKISGGzXTM6fqlUypsz4dpdd8GE\nCfDKK3CkThQVkRJhZoui/mmDiu6K3PPPh/btYfLkuCsREUmeogv9bt3g3HPhj3+Ef/4z7mpERJKl\n6EIfQkP3009hxoy4KxERSZaiDP3jjoPycg3xiIjUVpShbxaO9hcuDA1dEREJijL0Ab73PejQQadv\nioikK9rQ79oVvvMd+NOfwvi+iIgUcegDVFSEM3ima85PERGgyEP/2GPhS19SQ1dEpEZRh35NQ3fR\novAQESl1RR36AN/9LnTsqIauiAiUQOjvvTeMHg0PPBBm4BQRKWVFH/oQGrqffaaGrohISYT+kCFw\n+OFhiCdhk4qKiORVSYR+TUP3lVegGbM2i4gUvJIIfYB/+zfo1EkNXREpbSUT+nvtFRq606eroSsi\npatkQh/CEM+//hWmZhARKUUlFfpf/Wq4haIauiJSqkoq9GsauosXw9/+Fnc1IiL5V1KhD3DeedC5\nsxq6IlKasgp9MxtmZsvNbIWZXVPH63ea2avR400z+zjttQvM7K3ocUEui2+KPfeEMWPCrRQ3bYq7\nGhGR/MoY+mbWGrgbGA6UA2PMrDx9HXef4O5HuvuRwK+Bh6P3dgMmAkOAwcBEM+ua21+h8SorYfPm\ncPN0EZFSks2R/mBghbuvdPetwAxgVAPrjwFqJjw4DZjn7hvcfSMwDxjWnIJzIZWCo49WQ1dESk82\nod8TWJP2fG20bDdmdiDQD3i2Me81swozqzKzqurq6mzqbrbKSnj9dViwIC+bExFJhGxC3+pYVt/x\n8WhglrvvaMx73X2yu6fcPVVWVpZFSc03Zgx06aIbrIhIackm9NcCvdOe9wLW1bPuaHYO7TT2vXm1\nxx7hTJ4HH4SPP868vohIMcgm9BcCA8
ysn5m1IwT7nNormdkhQFfgxbTFc4FTzaxr1MA9NVqWCDUN\n3fvvj7sSEZH8yBj67r4dGE8I62XATHdfYmaTzGxk2qpjgBnuO1uj7r4BuJGw41gITIqWJcLRR4em\nrhq6IlIqzBOWdqlUyqvyOP/xvfeGm6y88AIcd1zeNisiklNmtsjdU5nWK7krcmsbMyaM7+sKXREp\nBSUf+l26hLn2Z86EjRvjrkZEpGWVfOhDaOhu2QL/9V9xVyIi0rIU+oTplgcPVkNXRIqfQj9SUQHL\nloWGrohIsVLoR0aPDjNwqqErIsVMoR/p3Bm++1146CH46KO4qxERaRkK/TQVFfD552roikjxUuin\nOeIIGDIkTMKmhq6IFCOFfi2VlfD3v8P8+XFXIiKSewr9Wr7zHdhrLzV0RaQ4KfRr6dQJvvc9mDUL\nPvww7mpERHJLoV+HigrYulUNXREpPgr9Onz5y3DssWroikjxUejXo7ISli+Hv/417kpERHJHoV+P\nc8+FvfdWQ1dEiotCvx4dO8L558Ps2VBdHXc1IiK5odBvQEUFbNsG06bFXYmISG4o9Btw2GFw/PFq\n6IpI8VDoZ1BZCW+9Bc89F3clIiLNp9DP4OyzoWtXNXRFpDhkFfpmNszMlpvZCjO7pp51zjWzpWa2\nxMweSFu+w8xejR5zclV4vtQ0dB95BD74IO5qRESaJ2Pom1lr4G5gOFAOjDGz8lrrDAB+Chzv7ocB\nl6e9vNndj4weI3NXev5UVoaG7h/+EHclIiLNk82R/mBghbuvdPetwAxgVK11LgbudveNAO5eVMfE\ngwbBCSeEhu4XX8RdjYhI02UT+j2BNWnP10bL0g0EBprZC2a2wMyGpb3WwcyqouWn17UBM6uI1qmq\nTuhJ8ZWV8Pbb8OyzcVciItJ02YS+1bGs9gmMbYABwNeBMcAUM9s7eq2Pu6eA84C7zOyg3X6Y+2R3\nT7l7qqysLOvi8+mss6Bbt3C0LyJSqLIJ/bVA77TnvYB1dazzmLtvc/d3gOWEnQDuvi76uhJ4Hjiq\nmTXHokMHuOCC0NB9//24qxERaZpsQn8hMMDM+plZO2A0UPssnEeBbwCYWQ/CcM9KM+tqZu3Tlh8P\nLM1V8flWUQHbt8Pvfx93JSIiTZMx9N19OzAemAssA2a6+xIzm2RmNWfjzAU+MrOlwHPA1e7+ETAI\nqDKzxdHyW929YEP/0EPhpJPg3nvV0BWRwmSesPkFUqmUV1VVxV1GvaZPh/POg7lz4dRT465GRCQw\ns0VR/7RBuiK3kc48E7p3V0NXRAqTQr+R2reHsWPhscdg/fq4qxERaRyFfhPUNHTvuy/uSkREGkeh\n3wQDB8I3vqGGrogUHoV+E1VUwLvvwrx5cVciIpI9hX4TnXEG9OihKZdFpLAo9JuofXv4/vdhzhxY\nV/v6ZBGRhFLoN0NFBezYoYauiBQOhX4zHHwwnHwyTJkSwl9EJOkU+s1UUQGrVsFf/hJ3JSIimSn0\nm+n002GffdTQFZHCoNBvpnbtQkP3iSfgH/+IuxoRkYYp9HPg4ovDmP7UqXFXIiLSMIV+Dhx0EJxy\nihq6IpJ8Cv0cqayENWvg6afjrkREpH4K/RwZNQr23VcNXRFJNoV+jrRtCxdeCE8+GY74RUSSSKGf\nQxdfHGbd1BW6IpJUCv0c6tcv3EJxypQw376ISNIo9HOsshLWroWnnoq7EhGR3Sn0c+zb34b99lND\nV0SSKavQN7NhZrbczFaY2TX1rHOumS01syVm9kDa8gvM7K3ocUGuCk+qtm3hBz8IR/qrV8ddjYjI\nrjKGvpm1Bu4GhgPlwBgzK6+1zgDgp8Dx7n4YcHm0vBswERgCDAYmmlnXnP4GCXTRReCuK3RFJHmy\nOdIfDKxw95XuvhWYAYyqtc7FwN3uvhHA3T+Ilp8GzHP3DdFr84BhuSk9ufr2hdNOU0NXRJInm9Dv\nCa
Sfeb42WpZuIDDQzF4wswVmNqwR78XMKsysysyqqqurs68+wSorwx21nnwy7kpERHbKJvStjmVe\n63kbYADwdWAMMMXM9s7yvbj7ZHdPuXuqrKwsi5KSb8QI2H9/NXRFJFmyCf21QO+0572A2neFXQs8\n5u7b3P0dYDlhJ5DNe4tSmzahofv00+EmKyIiSZBN6C8EBphZPzNrB4wG5tRa51HgGwBm1oMw3LMS\nmAucamZdowbuqdGyknDRReHrlCnx1iEiUiNj6Lv7dmA8IayXATPdfYmZTTKzkdFqc4GPzGwp8Bxw\ntbt/5O4bgBsJO46FwKRoWUk48EAYPjycxbNtW9zViIiAue82xB6rVCrlVVVVcZeRM3PmhBk4H34Y\nzjgj7mpEpFiZ2SJ3T2VaT1fktrBvfQt69oTJk+OuREREod/iahq6c+fCu+/GXY2IlDqFfh5cdBGY\nwb33xl2JiJQ6hX4e9O4dhnnuu08NXRGJl0I/TyorYf16ePzxuCsRkVKm0M+TYcOgVy9doSsi8WoT\ndwGlok2bMLb/i1/AypXQv3/cFTVsyxZ4+214883wWLMGrroqTCYnIoVLoZ9HP/gBTJoUGrq33BJ3\nNWEG0FWr4K23doZ7zWP16jA9dA0zWLEi3CfA6ppRSUQKgi7OyrNRo2DBgnDk3K5dy2/PHd57LwR5\n7XB/++1dG8t77gkDB+7+GDAgNKEnTIBHHw2/g4gkS7YXZyn08+zJJ8MMnA89BGefnbufu3HjzjCv\nHe6ffbZzvfbtQ4gPGLB7uJeV1X8Uv20bHHkkbN4MS5ZAx465q11Emi/b0NfwTp4NGwZ9+oSGbmND\n/1//CkMsdQX7hx/uXK9VK+jXLwT5CSfsGuy9e4fXG6ttW/j1r+Hkk+H22+G66xr/M0Qkfgr9PGvd\nOjR0r78+BPjBB+/6+rZt4crduo7a16zZdd0DDghBfuaZux659+/fMkNHQ4fCOeeEfsT554cJ5USk\nsGh4Jwbr1oWj/fPOg+OO2zXcV67c9RaLe+8Nhxyy+xj7gAHQpUv+a1+9Gg49NFxsNmtW/rcvInXT\n8E6CHXBAaIbef394dOwYQvzww8OQT02wDxwI3bsn62yZPn3gZz8LwzvPPBOGe0SkcOhIPyYbNsBr\nr8FBB4VZOJsyzh6XLVvgsMNCU3jx4jDeLyLx0tTKCdetG3z9601vrMapQwe46y5Ytgx+85u4qxGR\nxiiwuJGkGDEi3BVs4sQwp5CIFAaFvjSJWTja37IFrrkm7mpEJFsKfWmygQPhyith2jR48cW4qxGR\nbCj0pVmuvTY0osePhx074q5GRDJR6EuzdOkCt90GL78MU6fGXY2IZJJV6JvZMDNbbmYrzGy3EVwz\nG2tm1Wb2avS4KO21HWnL5+SyeEmG0aPhxBPD+fsbNsRdjYg0JGPom1lr4G5gOFAOjDGz8jpWfdDd\nj4weU9KWb05bPjI3ZUuSmIV5eTZuDNNLiEhyZXOkPxhY4e4r3X0rMAPQ5Lqyi8MPh0sugXvuCRds\niUgyZRP6PYH0qb7WRstqO8vMXjOzWWbWO215BzOrMrMFZnZ6c4qVZJs0KVx09uMf73oDFhFJjmxC\nv66ZX2r/ST8O9HX3w4H/BqalvdYnujT4POAuMztotw2YVUQ7hqrq6uosS5ek6do1zMA5fz5Mnx53\nNSJSl2xCfy2QfuTeC1iXvoK7f+Tun0dP7wW+kvbauujrSuB54KjaG3D3ye6ecvdUWVlZo34BSZYL\nL4RUCq6+Gj79NO5qRKS2bEJ/ITDAzPqZWTtgNLDLWThmtn/a05HAsmh5VzNrH33fAzgeWJqLwiWZ\nWrUK8/GsWwc33RR3NSJSW8bQd/ftwHhgLiHMZ7r7EjObZGY1Z+NcamZLzGwxcCkwNlo+CKiKlj8H\n3OruCv0iN2QIjB0Ld94Jy5fHXY2IpNPUytIi3n8/TNNw7LHw1FPJ
uieASDHS1MoSq333hRtugLlz\nYY4uyRNJDIW+tJhx46C8HCZMgM2b465GREChLy2obdtwpe4778Dtt8ddjYiAQl9a2NChcM454fz9\nVavirkZEFPrS4mqO8q+8Mt46REShL3nQp0+YgXP2bHjmmbirESltCn3Ji6uugv79w7w827bFXY1I\n6VLoS1506BDuqbtsWbhiV0TiodCXvBkxAoYPh4kTYf36uKsRKU0Kfckbs3C0v2ULXLPb/ddEJB8U\n+pJXAweGs3imTYMXX4y7GpHSo9CXvLv2WujZMzR1d+yIuxqR0qLQl7zr0gVuuw0WLYKpU+OuRqS0\nKPQlFqNHw4knhvP3N2yIuxqR0qHQl1iYhXl5Nm6E66+PuxqR0qHQl9gcfjhccgnccw8sXhx3NSKl\nQaEvsZo0Cbp1C03dhN3PR6QoKfQlVl27ws03w/z5MH163NWIFD+FvsTuwgshlYKrr4ZPP427GpHi\nptCX2LVuHebjWbcObrop7mpEiptCXxJhyBAYOxbuvBOWL4+7GpHilVXom9kwM1tuZivMbLdZU8xs\nrJlVm9mr0eOitNcuMLO3oscFuSxeisutt0LHjnD55WrqirSUjKFvZq2Bu4HhQDkwxszK61j1QXc/\nMnpMid7bDZgIDAEGAxPNrGvOqpeisu++cMMN8PTT8PjjcVcjUpyyOdIfDKxw95XuvhWYAYzK8uef\nBsxz9w3uvhGYBwxrWqlSCsaNg/LycLS/eXPc1YgUn2xCvyewJu352mhZbWeZ2WtmNsvMejfyvSIA\ntG0brtR9552d99YVkdzJJvStjmW1R1wfB/q6++HAfwPTGvFezKzCzKrMrKq6ujqLkqSYDR0K55wD\nt9wCq1bFXY1Icckm9NcCvdOe9wLWpa/g7h+5++fR03uBr2T73uj9k9095e6psrKybGuXIlZzlH/l\nlfHWIVJssgn9hcAAM+tnZu2A0cCc9BXMbP+0pyOBZdH3c4FTzaxr1MA9NVom0qA+fcIMnLNnwzPP\nxF2NSPHIGPruvh0YTwjrZcBMd19iZpPMbGS02qVmtsTMFgOXAmOj924AbiTsOBYCk6JlIhlddRX0\n7x/m5dm2Le5qRIqDecJOiE6lUl5VVRV3GZIQjz8OI0fCHXfAhAlxVyOSXGa2yN1TmdbTFbmSaCNG\nwPDhMHEirF8fdzUihU+hL4lmBnfdBVu2wDW7XQsuIo2l0JfEGzgQrrgCpk2DF1+MuxqRwqbQl4Lw\n85/DAQeEpu6OHXFXI1K4FPpSELp0CefuL1oEU6fGXY1I4VLoS8EYPRpOPDGcv79BJ/6KNIlCXwqG\nGfzqV7BxI1x/fdzViBQmhb4UlCOOgEsugXvugcWL465GpPAo9KXgTJoE3bqFpm7Cri0USTyFvhSc\nrl3h5pth/nyYPj3uakQKi0JfCtKFF8JXvgJXXw2ffhp3NSKFQ6EvBal1a/jNb2DdOrjpprirESkc\nCn0pWMccA2PHwp13wvLlcVcjUhgU+lLQbr0VOnYM99RVU1ckM4W+FLR994UbboCnnw7TMIsUqm3b\n8jOTrEJfCt64cVBeHo72N2+OuxqRxluwAFIpOPNM+OKLlt2WQl8KXtu28Otfwzvv7Ly3rkgh2LQp\nXGx43HFhapGf/ARatXAqK/SlKAwdCmefDbfcAqtWxV2NSMPc4aGH4NBD4Xe/g8sug6VL4fTTW37b\nbVp+EyL58R//AU8+Ge6t+9BD+d++O2zfHm74snlz+NrQ9zXPt24NVxjvt9/OR/fuLX/EJ/F4990w\nJPnnP8PRR8MTT4RrTvJFoS9Fo0+fMAPnddeFP6jBgzMHb2PCOZvvczUe26ZNaFLX7AT233/XnUL6\n806dcrNNaVnbtoW7wE2cGHbod94J48eH/9b5pBujS1HZsgUOOwxWrmz6z2jXLpwG2qFDeLT0923b\nhvHc994LZ2/UPGo//+CDuncq
e+zR8E6h5nmPHuGiNsm/l16Cigp47TUYNSr0oHr3zu02sr0xelb7\nGDMbBvwSaA1Mcfdb61nvbOAh4KvuXmVmfYFlQM2lMwvc/YfZbFOkKTp0gLlz4ZFHdoZqY4K4fft4\nhlW6d4cBAxpeZ8cOqK6uf6ewfj288kr4WtfUFK1awT77ZP7ksN9+YUcizbdpU/j0ec894c5vjzyS\nn3H7hmQMfTNrDdwNfBNYCyw0sznuvrTWensAlwIv1foRb7v7kTmqVySjgw8Oc/IUm9atd4ZyJp99\nBu+/3/Cnh9deC1/ruv1k58677gTSdwxHHRXGoqV+7jB7Nlx6afjvcOmlcOONydiZZnOkPxhY4e4r\nAcxsBjAKWFprvRuB/wNcldMKRaTROneG/v3DoyFffAEffVT/p4f33oMlS+CZZ+Djj3e+72tfgyuv\nhG9/W0NGtb37bhirf/LJsIOcMyecg58U2YR+T2BN2vO1wJD0FczsKKC3uz9hZrVDv5+ZvQJ8Avzc\n3ec3p2ARyZ1WraCsLDy+/OWG192yJewEHn0UfvlLOOOM8Knq8svDHEidO+el5MTati38u0ycGO7y\ndscd4Z4P+W7UZpLN6KXVsez/d3/NrBVwJ3BlHeu9B/Rx96OAK4AHzGzP3TZgVmFmVWZWVV1dnV3l\nIpJXHTpAv34wYQKsWAEPPhhONR0/Ppw5de21YadQil56Cb761TCsePLJ4Zz7CROSF/iQXeivBdL7\nzL2AdWnP9wC+BDxvZu8CxwBzzCzl7p+7+0cA7r4IeBsYWHsD7j7Z3VPuniorK2vabyIiedOmDZx7\nbpg+YP78cMP6W26BAw8MR/2vvRZ3hfmxaVPY6R17LHz4ITz8MDz2WNgJJlU2ob8QGGBm/cysHTAa\nmFPzortvcvce7t7X3fsCC4CR0dk7ZVEjGDPrDwwAmnEynYgkiVkY33/kkTC9dUVFuDDuiCPg1FPD\nmVQJOys8J9xh1qww59NvfxuGcZYuDUNeVtfYSIJkDH133w6MB+YSTr+c6e5LzGySmY3M8PYTgdfM\nbDEwC/ihu29obtEikjwDBoQb26xZA//+7/D66zBsWOgV3HcffP553BXmxqpVoYF9zjnhArqXXgpj\n+XvuNnCdTLo4S0RaxOefw4wZYXqM118PATl+PPzoR+G6hEKzfXsI9+uvD89vvDGcipmUcftsL87S\n7B4i0iLat4cLLoDFi2HevHD64nXXhStRf/QjePPNuCvM3t/+Fhq1V121s1F7xRXJCfzGUOiLSIsy\ng1NOgaeegjfegDFjwnDPoYeGKQn+53+SO+7/ySdhvP6YY8I0GLNnh0btgQfGXVnTKfRFJG8OOwym\nTg3j4j//ObzwApx0Upgcb/r0cK57EtRcUTtoENx9dxiWWrYs3OQk6Y3aTBT6IpJ3++0HkybB6tVh\nXppPPoHzzoODDgo9gE2b4qtt1SoYOTLcn2GffcJpqb/6VeE0ajNR6ItIbDp1gh/+MBxFz5kTpo24\n6qow7n/FFfm9Ic727WGHU14Ozz4bvl+4MHwKKSYKfRGJXatW4TTI558PQTtiRDi6PuggGD06LGtJ\nCxfubNQOHVrYjdpMFPoikiipFDzwQLgnwoQJoQE8eDCccEKY96euWUGb6pNPwmmXQ4aE2TBnzQqf\nOAq5UZuJQl9EEqlPH7jttnCx1x13hK9nnBHO+vntb8P00U3lHqZMGDQoXFA2blwYYjrrrMJv1Gai\n0BeRRNtzz90neRs3rumTvK1eHU4VPeusMLvoggXhTlZ77dUy9SeNQl9ECkLtSd5OOilM8ta3L3z/\n+5knedu+PXxiKC8P9we4/Xaoqiq+Rm0mCn0RKSg1k7w9/HC4qvfii2HmzIYneasJ9yuvDDuLJUvC\n98XYqM1EoS8iBevgg3dO8nbzzeGK3/RJ3j78EC67LDRq168PM4A+8UT4dFCqNOGaiBSN2pO81T
Rl\nL7kkzPxZzOP22U64VoIfbkSkWNVM8nb++WHc/vHHw5W+Q4Zkfm+pUOiLSNGpmeTtlFPiriR5NKYv\nIlJCFPoiIiVEoS8iUkIU+iIiJUShLyJSQhT6IiIlRKEvIlJCFPoiIiUkcdMwmFk10JybpPUAPsxR\nOS2tkGqFwqq3kGqFwqq3kGqFwqq3ObUe6O5lmVZKXOg3l5lVZTP/RBIUUq1QWPUWUq1QWPUWUq1Q\nWPXmo1YN74iIlBCFvohICSnG0J8cdwGNUEi1QmHVW0i1QmHVW0i1QmHV2+K1Ft2YvoiI1K8Yj/RF\nRKQeRRH6ZtbbzJ4zs2VmtsTMLou7poaYWQcz+5uZLY7qvSHumjIxs9Zm9oqZPRF3LZmY2btm9rqZ\nvWpmib4Nm5ntbWazzOzv0f+/x8ZdU33M7JDo37Tm8YmZXR53XfUxswnR39cbZjbdzDrEXVN9zOyy\nqM4lLf1vWhTDO2a2P7C/u79sZnsAi4DT3X1pzKXVycwM6Ozu/zSztsD/BS5z9wUxl1YvM7sCSAF7\nuvuIuOtpiJm9C6TcPfHnZpvZNGC+u08xs3ZAJ3f/OO66MjGz1sA/gCHu3pzralqEmfUk/F2Vu/tm\nM5sJ/Nnd/xBvZbszsy8BM4DBwFbgaeBH7v5WS2yvKI703f09d385+v5TYBnQM96q6ufBP6OnbaNH\nYve+ZtYL+F/AlLhrKSZmtidwIjAVwN23FkLgR04G3k5i4KdpA3Q0szZAJ2BdzPXUZxCwwN3/5e7b\ngb8CZ7TUxooi9NOZWV/gKOCleCtpWDRc8irwATDP3ZNc713AT4Av4i4kSw78xcwWmVlF3MU0oD9Q\nDfw+GjqbYmad4y4qS6OB6XEXUR93/wdwO7AaeA/Y5O5/ibeqer0BnGhm3c2sE/AtoHdLbayoQt/M\nugCzgcvd/ZO462mIu+9w9yOBXsDg6CNe4pjZCOADd18Udy2NcLy7Hw0MB8aZ2YlxF1SPNsDRwD3u\nfhTwGXBNvCVlFg1DjQQeiruW+phZV2AU0A84AOhsZt+Nt6q6ufsy4H8D8whDO4uB7S21vaIJ/Whs\nfDbwJ3d/OO56shV9nH8eGBZzKfU5HhgZjZPPAIaa2R/jLalh7r4u+voB8AhhrDSJ1gJr0z7lzSLs\nBJJuOPCyu78fdyENOAV4x92r3X0b8DBwXMw11cvdp7r70e5+IrABaJHxfCiS0I8ao1OBZe5+R9z1\nZGJmZWa2d/R9R8L/oH+Pt6q6uftP3b2Xu/clfKR/1t0TecQEYGado2Y+0VDJqYSPz4nj7uuBNWZ2\nSLToZCCRJx/UMoYED+1EVgPHmFmnKB9OJvT6EsnM9om+9gHOpAX/fdu01A/Os+OB7wGvR+PkAD9z\n9z/HWFND9gemRWdAtAJmunvKuhkWAAAAhElEQVTiT4UsEPsCj4S/c9oAD7j70/GW1KAfA3+KhkxW\nAt+PuZ4GRWPO3wQq466lIe7+kpnNAl4mDJW8QrKvzJ1tZt2BbcA4d9/YUhsqilM2RUQkO0UxvCMi\nItlR6IuIlBCFvohICVHoi4iUEIW+iEgJUeiLiJQQhb6ISAlR6IuIlJD/B/bljYnUkcYPAAAAAElF\nTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x257c73dd2e8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同PCA维数下模型的性能，找到最佳模型／参数（分数最高）\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-47.43188286,  10.9174202 ],\n",
       "       [-49.36571383,  11.39233661],\n",
       "       [-44.82562898,   9.29477031],\n",
       "       ...,\n",
       "       [-46.74973476,   9.71984271],\n",
       "       [-50.6341985 ,  10.75877177],\n",
       "       [ 72.86436051, -17.80160954]])"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display train_pca, the array that the next code cell clusters and plots.\n",
    "train_pca"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第四部分：结果展示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD8CAYAAACCRVh7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAG5pJREFUeJzt3X+8VXWd7/HXmx8eUiARj2ICAQop\nZhEeTZvrj5QQvTikjQ5NGdOU5Dx0cu7lTqXemaKmx+NaVmNleqksNe6YiSmjjSkaY965mucIGT80\nDyAjysARBSwR+fG5f6x1OJvDXhw8+6y9zt7n/Xw89uOs9V1r7f1ZZ4uf8/2u7/osRQRmZmbl9Cs6\nADMz672cJMzMLJOThJmZZXKSMDOzTE4SZmaWyUnCzMwyDcj7AyQ9D7wG7AJ2RkSTpMOAnwJjgOeB\nSyLi1bxjMTOzt6ZaPYkPRsSkiGhK178APBwR44GH03UzM+tlihpumgHcmi7fCny4oDjMzGw/lPcd\n15LWAK8CAfzviJgnaXNEHFqyz6sRMazMsbOB2QCHHHLISccdd1yusZqZ1ZuWlpaXI6Kxu8fnfk0C\n+JOIeEnSEcBDkp450AMjYh4wD6CpqSmam5vzitHMrC5JWlvJ8bkPN0XES+nPjcDPgVOADZKOAkh/\nbsw7DjMze+tyTRKSDpE0pH0ZmAosAxYCs9LdZgH35hmHmZl1T97DTUcCP5fU/ln/JyIekPQkcKek\nTwH/AVyccxxmZtYNuSaJiFgNvLdM+ybgnDw/28zMKuc7rs3MLJOThJmZZXKSMDOzTE4SZmaWyUnC\nzMwyOUmYmVkmJwkzM8vkJGFmZpmcJMzMLJOThJmZZXKSMDOzTE4SZmaWyUnCzMwyOUmYmVkmJwkz\nM8vkJGFmZpmcJMzMLJOThJmZZco1SUgaJelXklZKWi7pqrT9S5JelLQ0fZ2fZxxmZtY9uT7jGtgJ\nzImIpyQNAVokPZRu+1ZEXJ/z55uZWQVyTRIRsR5Yny6/JmklcHSen2lmZj2natckJI0B3gc8kTZd\nKelpSbdIGlatOMzM7MBVJUlIGgwsAP42IrYCNwHHAJNIehrfyDhutqRmSc1tbW3VCNXMzErkniQk\nDSRJEPMj4m6AiNgQEbsiYjfwfeCUcsdGxLyIaIqIpsbGxrxDNTOzTvKe3STgh8DKiPhmSftRJbtd\nCCzLMw4zM+uevGc3/QlwKfA7SUvTtmuAj0qaBATwPPCZnOMwM7NuyHt202OAymz6RZ6fa2ZmPcN3\nXJuZWSYnCTMzy+QkUSNuvBFGj4YRI+Af/gEiio7IzPoCJ4ka0NICV14JkyfDjh3wla/AJZc4UZhZ\n/pwkasDChcnPe++FzZuT5bvugsZG2LWruLjMrP45SdSADRs6lnfv7ljetAkuuqj68ZhZ3+EkUQOO\nPHLv9f79O5YffLC6sZhZ3+IkUQOmT997vXSIafv26sZiZn2Lk0QNOPlkOO20jvVTT917u2c9mVle\nnCRqxA03dCw/8UTHckQy62nnzo5ZT6+88irHHXccgwYNYtiwYXz84x/njTfeqH7QZlbznCRqxMkn\nw5w5yXLn3sK99yZJAl7lrrvGM3z4YTz77LPs3r2bESNGMH/+fO6///5qh2xmdcBJooZcfz18+9sw\nciQccQS8730d2666CgYPHgj8Ne1f644dO3jmmWeQxDvf+c5CYjaz2uYkUWP+5m/ghReSabEXXNDR\nftllsGvXYOB8YPBex0QE73//+z30ZGZvmZNEDSud9bRgAWzbBrAW+OM++5599tl8+tOf9tCTmb0l\nThI1rHTW0zXXtPcszuCIIy4ts+/JjB49moaGBkaMGOEL22Z2QJwkalz7rKeBA+GRR5YCtzJmzNZ9\n9rvuuuv47Gc/y5lnnskxxxzDV7/6VZ
YtW+behZntl5NEjTv55ORi9pAhMGDARgYO/By/+c3d++w3\nbdo0vvKVr/Dggw9y55138pGPfIRjjz12T+9iwoQJBURvZr2dk0QdaL+YvXnzVD7zmVll9znttNM4\n5JBDADj44IP59a9/zdve9rY9vYuxY8dWM2QzqxFOEnXmggsuoLGxcc96Q0MDgwcPZu7cuVx33XVc\nccUVzJo1i6amJpYsWbKnd3HLLbcUGLWZ9Va5PuN6fyRNA24A+gM/iIj/VVQs9WTq1Kls3Lhxv/ss\nXbqUl19+mXHjxu3VuzAz66yQJCGpP3Aj8CFgHfCkpIURsaKIePqajRs3ctlll7F+/XoOO+ywPb0L\nM7POiupJnAK0RsRqAEl3ADMAJ4kqmDp1KmvXri06DDOrAUVdkzgaeKFkfV3athdJsyU1S2pua2ur\nWnBmZpYoKkmoTNs+Ra4jYl5ENEVEU+nFWDMzq46iksQ6YFTJ+kjgpYJisTJ27NrBB374ARr+sQHN\nFc9vfr7okMysAEUliSeB8ZLGSjoImAksLCgWK0MS0ydMZ8a7ZhQdipkVqJAkERE7gSuBXwIrgTsj\nYnkRsVh5A/oN4JrTr2HC8ORObPcszPqmwu6TiIhfAL8o6vOtvB27dnDmj8+k+aVmduzeQT8lf0e8\n+NqLTJ8wnZFDR/KzFT8rOEozq5bCkoT1Tu3DTO8Y8g4WrFzA2EPHsurVVZw//3y27dxWdHhmVmUu\ny2F7aR9mOu7w4wAYNGAQAMcMO4ajBh9VZGhmVgD3JGwv7cNNT7z4BADL25JLRcvalnHEwUfstd8H\nfvgBWta38OauN1lz1RrGHDqmiJDNLEfuSdhe2oeb3jX8XQB84j2fAGDNVWs4emjH/Y5rNq/hjHee\n4dlPZnXOPQnby4B+A7jo+ItYsGIBkFywBlj1yiqW/OeSPfud+5NzaTy4kU3bNgGwbus69yTM6pCT\nhO3j+BuP37P88JqHAZhy+xTe3vB2tmzfwtemfI0Pjv0gD656kJ88/RNWvryyqFDNLGcebrJ9xBeD\na0+/FkiGmeKLScWULdu3APC5RZ9j5l0zueb0azjsbYcVFqeZ5c89CdvHMy8/w6bXk2GkVa+soqF/\nw55E0Xm/bTuSabFrN69NZkAN8Qwos3qiiH3/8fdGTU1N0dzcXHQYfYLm7l1/cdZ7Z/HjD/+42/uZ\nWXEktUREU3ePd0/C9lGu11DOyitWcsPjN3Bzy80sunQRExsn5hyZmVWbr0lYtx1/4/Hc3HIzkFzY\nvvrhq7v3RitWwKRJcPDBcPjh8Hd/14NRmlkl3JOwbjvQHkeX3ngDLr0UzjsPvvtduP76ZPnss3vm\n/c2s25wkrHiTJycvgHPOgZtugldeKTYmMwM83GS9xYoVcOKJcPHF0K8fPPZY0RGZGU4S1lu0tcGW\nLdDYCDNnwg03wCOPFB2VWZ/nJGHF27o1uVj9hz/AT38KZ56ZtP/2t76gbVYwX5Ow4j31FDz5ZLL8\nwQ8mP4cNg1NOgd27fUHbrEDuSVjxzjoLIuDxx5NeAyTr99wDc+bAxInJBW3wBW2zKsstSUj6uqRn\nJD0t6eeSDk3bx0jaJmlp+ro5rxishmzdCp/6VJIcfvQjuOCCpOfwyCPJtYq5c+HYY+H884uO1KxP\nybMn8RDw7oh4D/B7oPROq1URMSl9XZ5jDNaLtD+oqOEfG9Bc8fzm5zs2PvUULF8O27bBJz8Jt9+e\ntL/wAkydCps2wQMPdPQ0zKwqcksSEfFgROxMVx8HRub1WVYb2h9oVPZBRe1DThGweXMyHXbcOPjO\nd+C552D+fGhoSHocZlY11bom8VfAv5asj5W0RNK/STo96yBJsyU1S2pua2vLP0rLVfvzsycMn5C9\n05YtHT2HL38ZWlrg1VeTC9qjRsE3v1m9gM2sstlNkhYBI8psujYi7k33uRbYCcxPt60HRkfEJkkn\nAf
dIOiEi9vkTMSLmAfMgqQJbSaxWA7ZuhQ99CFpb4e67k2sQW7bA0KFFR2bWZ1WUJCJiyv62S5oF\nTAfOibQmeURsB7anyy2SVgETANcB7+vKTYX94hfhS18qLCSzvi63+yQkTQM+D5wZEa+XtDcCr0TE\nLknjgPHA6rzisN6l3AON9jyoqP26hJn1GnneTPddoAF4SBLA4+lMpjOAL0vaCewCLo8IT37vI0qf\nnz3l9il+UJFZL5dbkoiIYzPaFwAL8vpc6926LC++YgX8xV/A73+fTHf95Cfh61+vTnBmtg/fcW29\nS/uzJZqb4ZJLOm6oM7NCuHaT9S6lz5aYkE6VPe88GDLEvQqzArgnYb3Tli1w441J9dd//3f3KswK\n4iRhvU/7DXWvv54U/TvpJBf4MyuIh5usd+l8Q11DQ1K/yQX+zArhJGG9S7kb6t7xjuTno4+6wJ9Z\nlXm4yXqX0kJ/W7bAyScnlWFd4M+sEE4S1nu19yrKFfhbscKPNjWrAg83We+1vzId7fdT+NGmZrly\nkrDaVHo/xTnnwE03eeaTWQ483GS1zY82NcuVexJWu0ofUOSZT2a5cE/CalP7/RR+tKlZrpwkrDbt\nb+aTmfUYDzdZbfIDisyqwj0JMzPL5CRhZmaZnCTMzCxTbklC0pckvShpafo6v2Tb1ZJaJT0r6dy8\nYjCriEt/mOXek/hWRExKX78AkDQRmAmcAEwDviepf85xmL11fpSqWSHDTTOAOyJie0SsAVqBUwqI\nw2z/Jk+GOXNg4kQ/9Mj6rLyTxJWSnpZ0i6RhadvRwAsl+6xL2/YhabakZknNbW1tOYdqlsGlP6wP\nqyhJSFokaVmZ1wzgJuAYYBKwHvhG+2Fl3qrshPeImBcRTRHR1NjYWEmoZt1TWvrjgQdc+sP6nIpu\npouIKQeyn6TvA/elq+uAUSWbRwIvVRKHWS7KPUp161YYOrToyMyqJs/ZTUeVrF4ILEuXFwIzJTVI\nGguMB36TVxxm3ebSH2a5luX4mqRJJENJzwOfAYiI5ZLuBFYAO4ErImJXjnGYdY9Lf5jllyQi4tL9\nbPsq8NW8PtvMzHqG77g2M7NMThJmZpbJScLMzDI5SZiZWSYnCTMzy+QkYdYVV4O1PsxJwqwrrgZr\nfZifcW3WlcmTkxck1WBvusnVYK3PcE/C7EC5Gqz1Qe5JmB2I0mqwjz7qarDWZ7gnYdaV9mqwzz0H\n8+d3VIM16wOcJMy64mqw1od5uMmsK64Ga32YexJmNe61155i8eJ+LF4sdu/eWXQ4VmecJMxqXGvr\nf0c6qOgwrE45SZjVsLa2n7N9+1oOP/zDRYdidcpJwqxG7d69g9WrP8+4cdfRr9+gvbZ5CMp6ipOE\nWY1av/77DBw4nMMPv4jkKcEAyZOAPQRlPSW3JCHpp5KWpq/nJS1N28dI2lay7ea8YjArTBWKAr7+\n+u/ZuvVxHn10IBs23AbAY48N9xCU9ajckkRE/HlETIqIScAC4O6Szavat0XE5XnFYFaYKhQFHDVq\nDpMnP8nkyU8yfPh0AN773kWZQ1Bm3ZH7fRKSBFwCnJ33Z5n1GlUoCjho0CgGDRoFwIkn/gsAL774\nvT1DUJs23Z/uuQvfEmXdVY1rEqcDGyLiuZK2sZKWSPo3SadnHShptqRmSc1tbW35R2rW06pcFDBr\nCMqsuyr680LSImBEmU3XRsS96fJHgX8u2bYeGB0RmySdBNwj6YSI2KcYTkTMA+YBNDU1+ZZXqy0F\nFAUcNWoORx75cQDWrp3Lpk33MWnS4tw/1+pXRUkiIqbsb7ukAcBFwEklx2wHtqfLLZJWAROA5kpi\nMetV2osCtrbC3Xd3FAUcOjTXjy03BGVWibyHm6YAz0TEuvYGSY2S+qfL44DxwOqc4zCrLhcFtDqR\n99Wsmew91ARwBvBlSTtJrqhdHhF+zJfVFxcFtDqRa5KIiL8s07aA
ZEqsmZn1cr7j2sz24pIeVspJ\nwsz24pIeVspJwsz2cEkP68xJwsyA/VeVtb7LScLMgP1XlbW+ywVdzAzYu6RHu8ceG84ZZ/yhwKis\naO5JmBlQvqqsS3qYexJmBrikh5XnnoSZmWVykjAzs0xOEmZmlslJwszMMjlJmJlZJicJMzPL5CRh\nVkdcwdV6mpOEWQ3rnBRcwdV6mpOEWQ0rTQovv3xP4RVc3ZOpP04SZjWqc1nv1auvLryCq3sy9afi\nJCHpYknLJe2W1NRp29WSWiU9K+nckvZpaVurpC9UGoNZX1OurPfAgcNpaBjDhg23AhCxvaox+VkU\n9aknehLLgIuAR0sbJU0EZgInANOA70nqL6k/cCNwHjAR+Gi6r5kdoHJlvV977QmWLHn/nn0ee+yI\nqsXjZ1HUr4oL/EXESgBJnTfNAO6I5M+ZNZJagVPSba0RsTo97o503xWVxmLWV5Qr6y0dxMCBhwPw\n5psvMWnSw1WLpzRpbdp0f9q6C9cQrX15XpM4GnihZH1d2pbVvg9JsyU1S2pua2vLLVCzWlOurPdB\nBx3Jscd+i2HDPgTAkCFN+3uLHlWatDZsuA1InkVhte+AkoSkRZKWlXnN2N9hZdpiP+37NkbMi4im\niGhqbGw8kFDN+oRBg0YxdGgTQ4c2ceKJ/8L48TfS0HB0YU+V87Mo6tcB9QUjYko33nsdMKpkfSTw\nUrqc1W5m3VD0U+X8LIr6ledw00JgpqQGSWOB8cBvgCeB8ZLGKpkrNzPd18y6yX/JW14qvqok6ULg\nO0AjcL+kpRFxbkQsl3QnyQXpncAVEbErPeZK4JdAf+CWiFheaRxmfZn/kre8KKLs5YBep6mpKZqb\nm4sOw8yspkhqiYhuz2LwHddmtofLalhnThJmtofLalhnThJmBrishpXnJGFmLqthmZwkzKxsLahq\n3oxnvZcLq5hZ4TfjWe/lnoSZ+WY8y+SehJn5ZjzL5J6EmZllcpIwM7NMThJmZpbJScLMzDI5SZiZ\nWSYnCTMzy+QkYWZmmZwkzMwsk5OEmfUoP5OivjhJmFmP8jMp6ktFSULSxZKWS9otqamk/UOSWiT9\nLv15dsm2xZKelbQ0fR1RSQxm1nv4mRT1p9KexDLgIuDRTu0vAxdExInALOD2Tts/FhGT0tfGCmMw\ns17Az6SoTxUliYhYGRHPlmlfEhEvpavLgUGSGir5LDPr3fxMivpUjWsSHwGWRMT2krYfpUNNfy9J\nVYjBzHJW+kyKDRtuA5JnUlht67JUuKRFwIgym66NiHu7OPYE4DpgaknzxyLiRUlDgAXApcBtGcfP\nBmYDjB49uqtQzaxAo0bN4cgjPw7A2rVz2bTpPj+Tog4oIrreq6s3kRYD/yMimkvaRgKPAJ+MiP+b\ncdxfAk0RcWVXn9HU1BTNzc1d7WZmZiUktUREU9d7lpfLcJOkQ4H7gatLE4SkAZIOT5cHAtNJLn6b\nmVkvVOkU2AslrQNOA+6X9Mt005XAscDfd5rq2gD8UtLTwFLgReD7lcRgZmb56ZHhpmrwcJOZ2VvX\nK4ebzMysPjhJmJlZJicJMzPL5CRhZmaZnCTMzCyTk4SZmWVykjAzs0xOEmZmlslJwszMMjlJmJlZ\nJicJMzPL5CRhZmaZnCTMzCyTk4SZmWVykjAzs0xOEmZmlslJwszMMjlJmJlZJicJMzPLVFGSkHSx\npOWSdktqKmkfI2mbpKXp6+aSbSdJ+p2kVknflqRKYjAzs/xU2pNYBlwEPFpm26qImJS+Li9pvwmY\nDYxPX9MqjMHMzHJSUZKIiJUR8eyB7i/pKGBoRPy/iAjgNuDDlcRgZmb5GZDje4+VtATYCvzPiPg1\ncDSwrmSfdWlbWZJmk/Q6AP4g6YATUhmHAy9XcHwt68vnDj5/n3/G+b/73Rx8ww0c368fTJlCy65d\nVY6sOt5VycFdJglJi4ARZTZd
GxH3Zhy2HhgdEZsknQTcI+kEoNz1h8j67IiYB8zrKsYDIak5Ipq6\n3rP+9OVzB5+/zz/7/Bcv1mJgHNCwaBGnnnVW7KxqcFUgqbmS47tMEhEx5a2+aURsB7anyy2SVgET\nSHoOI0t2HQm89Fbf38ysUosX60LgncA9wJ8XHE6vlcsUWEmNkvqny+NILlCvjoj1wGuSTk1nNX0C\nyOqNmJnlYvFiDQSuAz4PvFFwOL1apVNgL5S0DjgNuF/SL9NNZwBPS/otcBdweUS8km77a+AHQCuw\nCvjXSmJ4C3pk2KpG9eVzB5+/z39flwGbgLvpGAbvX7WIqqui71/JJCMzs75j8WL9E3BVp+Y/nnVW\nDC4int7Md1ybWV/0DeDk9HVf2nZWYdH0Yu5JmJlZprrrSUj6uqRnJD0t6eeSDi3ZdnVaDuRZSeeW\ntE9L21olfaGYyPNRz+cGIGmUpF9JWpmWiLkqbT9M0kOSnkt/DkvblZaDaU3/G5lc7Bn0DEn9JS2R\ndF+6PlbSE+n5/1TSQWl7Q7remm4fU2TcPUHSoZLuSv/dr5R0Wl/6/iX9t/S//WWS/lnSoJ78/usu\nSQAPAe+OiPcAvweuBpA0EZgJnEBSCuR76T+s/sCNwHnAROCj6b41r57PrcROYE5EHA+cClyRnuMX\ngIcjYjzwcLoOye+ivSTMbJIyMfXgKmBlyfp1wLfS838V+FTa/ing1Yg4FvhWul+tuwF4ICKOA95L\n8nvoE9+/pKOBzwJNEfFukovvM+nB77/ukkREPBix54aYx+m4L2MGcEdEbI+INSSzq05JX60RsToi\n3gTuSPetB/V8bgBExPqIeCpdfo3kfxBHk5znrelut9JR/mUGcFskHgcOTcvF1CxJI4H/SjJrkHR6\n+dkkMwth3/Nv/73cBZxTy0U2JQ0lmU35Q4CIeDMiNtOHvn+S+93eJmkAcDDJzcw99v3XXZLo5K/o\nmGJ7NPBCybb2kiBZ7fWgns9tH2nX+X3AE8CR6X05pD+PSHerx9/JPwGfA3an68OBzSV/LJWe457z\nT7dvSfevVeOANuBH6XDbDyQdQh/5/iPiReB64D9IksMWoIUe/P5rMklIWpSOv3V+zSjZ51qSoYj5\n7U1l3ir2014P6vnc9iJpMLAA+NuI2Lq/Xcu01ezvRNJ0YGNEtJQ2l9k1DmBbLRoATAZuioj3AX+k\nY2ipnLo6//RaywxgLPAO4BCSIbXOuv3951ngLzddlQqRNAuYDpwTHdO31gGjSnYrLQmS1V7r9nfO\ndUPSQJIEMT8i7k6bN0g6KiLWp8MJG9P2evud/Anwp5LOBwYBQ0l6FodKGpD+tVh6ju3nvy4dnng7\n8Mq+b1sz1gHrIuKJdP0ukiTRV77/KcCaiGgDkHQ38AF68PuvyZ7E/kiaRnKr/Z9GxOslmxYCM9Or\n+2NJLlz9BngSGJ/OBjiI5KLPwmrHnZN6Pjdgz/j7D4GVEfHNkk0LgVnp8iw6yr8sBD6RznI5FdjS\nPixRiyLi6ogYGRFjSL7fRyLiY8CvgD9Ld+t8/u2/lz9L96/Zv6Qj4j+BFyS1Vzo9B1hBH/n+SYaZ\nTpV0cPpvof38e+77j4i6epFckH4BWJq+bi7Zdi1JKZBngfNK2s8nmQm1iqS6beHn0YO/j7o9t/T8\n/gtJd/npku/8fJJx1oeB59Kfh6X7i2TG1yrgdySzQgo/jx76XZwF3JcujyP5I6gV+BnQkLYPStdb\n0+3jio67B857EtCc/jdwDzCsL33/wFzgGZKHwN0ONPTk9++b6czMLFPdDTeZmVnPcZIwM7NMThJm\nZpbJScLMzDI5SZiZWSYnCTMzy+QkYWZmmf4/EJ2bXhd32RkAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x257c874a588>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#显示聚类结果\n",
    "#画出聚类结果，每一类用一种颜色，为方便查看，只看四个分类\n",
    "colors = ['b','g','r','k','y']\n",
    "\n",
    "n_clusters = 5\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters = n_clusters)\n",
    "mb_kmeans.fit(train_pca)\n",
    "\n",
    "y_train_pred = mb_kmeans.labels_\n",
    "cents = mb_kmeans.cluster_centers_#质心\n",
    "\n",
    "for i in range(n_clusters):\n",
    "    index = np.nonzero(y_train_pred==i)[0]\n",
    "    x1 = train_pca[index,0]\n",
    "    x2 = train_pca[index,1]\n",
    "    y_i = y_train_pred[index]\n",
    "    for j in range(len(x1)):\n",
    "        if j < 10:  #每类打印20个\n",
    "            plt.text(x1[j],x2[j],str(int(y_i[j])),color=colors[i],\\\n",
    "                fontdict={'weight': 'bold', 'size': 9})\n",
    "    #plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],linewidths=12)\n",
    "    plt.axis([-250,800,-150,50])\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
